From ce5172c899ff0dc7e413221a12ff666f39ed0121 Mon Sep 17 00:00:00 2001 From: Vikram Auradkar Date: Fri, 15 Jul 2022 19:21:42 +0000 Subject: [PATCH] crosvm: upstream windows src - Upstreams all windows specific files in src/ - Adds windows specific args to Config/Command parsing. - Adds noop anti tamper crate. There are still some deltas between upstream and downstream src because of moving HEAD in upstream and some code refactors downstream. But this is most of the code. BUG=b:213146388 TEST=built on windows downstream. upstream crosvm does not build on windows yet because of to-be-upstreamed dependency crates. presubmit. Change-Id: I3445975749f8108ae51d5fb6e1c2f1447439e1fb Reviewed-on: https://chromium-review.googlesource.com/c/crosvm/crosvm/+/3765346 Commit-Queue: Vikram Auradkar Auto-Submit: Vikram Auradkar Tested-by: Vikram Auradkar Reviewed-by: Daniel Verkamp --- Cargo.toml | 1 + anti_tamper/Cargo.toml | 14 + anti_tamper/src/lib.rs | 7 + anti_tamper/src/noop.rs | 35 + src/crosvm/cmdline.rs | 93 ++ src/crosvm/config.rs | 92 +- src/crosvm/sys.rs | 12 +- src/crosvm/sys/unix/config.rs | 24 +- src/crosvm/sys/windows.rs | 10 + src/crosvm/sys/windows/broker.rs | 1747 ++++++++++++++++++++++++ src/crosvm/sys/windows/cmdline.rs | 85 ++ src/crosvm/sys/windows/config.rs | 822 ++++++++++++ src/crosvm/sys/windows/exit.rs | 489 +++++++ src/crosvm/sys/windows/stats.rs | 241 ++++ src/main.rs | 37 +- src/sys.rs | 12 +- src/sys/unix/main.rs | 10 +- src/sys/windows.rs | 2078 +++++++++++++++++++++++++++++ src/sys/windows/irq_wait.rs | 364 +++++ src/sys/windows/main.rs | 247 ++++ src/sys/windows/metrics.rs | 93 ++ src/sys/windows/panic_hook.rs | 26 + src/sys/windows/run_vcpu.rs | 922 +++++++++++++ 23 files changed, 7446 insertions(+), 15 deletions(-) create mode 100644 anti_tamper/Cargo.toml create mode 100644 anti_tamper/src/lib.rs create mode 100644 anti_tamper/src/noop.rs create mode 100644 src/crosvm/sys/windows.rs create mode 100644 src/crosvm/sys/windows/broker.rs create mode 100644 src/crosvm/sys/windows/cmdline.rs create mode 100644 src/crosvm/sys/windows/config.rs create mode 100644 src/crosvm/sys/windows/exit.rs create mode 100644 src/crosvm/sys/windows/stats.rs create mode 100644 src/sys/windows.rs create mode 100644 src/sys/windows/irq_wait.rs create mode 100644 src/sys/windows/main.rs create mode 100644 src/sys/windows/metrics.rs create mode 100644 src/sys/windows/panic_hook.rs create mode 100644 src/sys/windows/run_vcpu.rs diff --git a/Cargo.toml b/Cargo.toml index 1ca956270a..a886090bcd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,7 @@ lto = true members = [ "aarch64", "acpi_tables", + "anti_tamper", "arch", "argh_helpers", "base", diff --git a/anti_tamper/Cargo.toml b/anti_tamper/Cargo.toml new file mode 100644 index 0000000000..0e85191a20 --- /dev/null +++ b/anti_tamper/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "anti_tamper" +version = "0.1.0" +authors = ["The Chromium OS Authors"] +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +anti-tamper = [] +proto-tube-hack = [] + +[dependencies] +base = { path = "../base" } diff --git a/anti_tamper/src/lib.rs b/anti_tamper/src/lib.rs new file mode 100644 index 0000000000..e091eb634b --- /dev/null +++ b/anti_tamper/src/lib.rs @@ -0,0 +1,7 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
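+//
+// The `noop` backend keeps the `anti_tamper` API identical whether or not a real
+// anti-tamper implementation is compiled in, so call sites (such as the
+// `kiwi`-gated call to `anti_tamper::setup_common_metric_invariants` in
+// src/crosvm/sys/windows/broker.rs below) need no conditional code. A minimal
+// usage sketch; the function and variable names here are illustrative, and only
+// the `base::Tube` API already used elsewhere in this patch is assumed:
+//
+//     use base::Tube;
+//
+//     fn start_anti_tamper() {
+//         // Returns false for the noop backend, so callers skip vCPU monitoring setup.
+//         let _monitoring = anti_tamper::enable_vcpu_monitoring();
+//         let (thread_end, _main_end) = Tube::pair().expect("failed to create tube");
+//         // The noop thread body is empty, so join() returns immediately.
+//         anti_tamper::spawn_dedicated_anti_tamper_thread(thread_end)
+//             .join()
+//             .expect("anti-tamper thread panicked");
+//     }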
+ +pub mod noop; + +pub use noop::*; diff --git a/anti_tamper/src/noop.rs b/anti_tamper/src/noop.rs new file mode 100644 index 0000000000..bee5b73ae3 --- /dev/null +++ b/anti_tamper/src/noop.rs @@ -0,0 +1,35 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::thread; + +use base::Tube; + +pub fn setup_common_metric_invariants( + _product_version: &Option, + _product_channel: &Option, + _use_vulkan: &Option, +) { +} + +#[cfg(feature = "proto-tube-hack")] +pub fn forward_security_challenge(_recv: &Tube, _sender: &Tube) {} + +#[cfg(feature = "proto-tube-hack")] +pub fn forward_security_signal(_recv: &Tube, _sender: &Tube) {} + +pub fn enable_vcpu_monitoring() -> bool { + false +} + +// This is a hard limit as it is used to set the Tube buffer size, and will +// deadlock if exceeded (b/223807352). +pub const MAX_CHALLENGE_SIZE: usize = 1; + +pub fn spawn_dedicated_anti_tamper_thread( + #[cfg(not(feature = "proto-tube-hack"))] _tube_to_main_thread: Tube, + #[cfg(feature = "proto-tube-hack")] _tube_to_main_thread: base::Tube, +) -> thread::JoinHandle<()> { + thread::spawn(move || {}) +} diff --git a/src/crosvm/cmdline.rs b/src/crosvm/cmdline.rs index 7321d21b2d..eb3ca571d3 100644 --- a/src/crosvm/cmdline.rs +++ b/src/crosvm/cmdline.rs @@ -17,6 +17,9 @@ cfg_if::cfg_if! { parse_coiommu_params, VfioCommand, parse_vfio, parse_vfio_platform, }; use super::config::SharedDir; + } else if #[cfg(windows)] { + use crate::crosvm::sys::config::IrqChipKind; + } } @@ -514,6 +517,10 @@ pub struct RunCommand { /// num_input_streams=INT - Set number of input PCM streams /// per device. pub cras_snds: Vec, + #[cfg(feature = "crash-report")] + #[argh(option, long = "crash-pipe-name", arg_name = "\\\\.\\pipe\\PIPE_NAME")] + /// the crash handler ipc pipe name. + pub crash_pipe_name: Option, #[argh(switch)] /// don't set VCPUs real-time until make-rt command is run pub delay_rt: bool, @@ -584,6 +591,10 @@ pub struct RunCommand { #[argh(positional, arg_name = "KERNEL")] /// bzImage of kernel to run pub executable_path: Option, + #[cfg(windows)] + #[argh(switch, long = "exit-stats")] + /// gather and display statistics on Vm Exits and Bus Reads/Writes. + pub exit_stats: bool, #[argh( option, long = "file-backed-mapping", @@ -674,6 +685,10 @@ pub struct RunCommand { #[argh(switch)] /// use mirror cpu topology of Host for Guest VM, also copy some cpu feature to Guest VM pub host_cpu_topology: bool, + #[cfg(windows)] + #[argh(option, long = "host-guid", arg_name = "PATH")] + /// string representation of the host guid in registry format, for namespacing vsock connections. + pub host_guid: Option, #[cfg(unix)] #[argh(option, arg_name = "IP")] /// IP address to assign to host tap interface @@ -687,9 +702,17 @@ pub struct RunCommand { #[argh(option, short = 'i', long = "initrd", arg_name = "PATH")] /// initial ramdisk to load pub initrd_path: Option, + #[cfg(windows)] + #[argh(option, long = "irqchip", arg_name = "kernel|split|userspace")] + /// type of interrupt controller emulation. \"split\" is only available for x86 KVM. + pub irq_chip: Option, #[argh(switch)] /// allow to enable ITMT scheduling feature in VM. The success of enabling depends on HWP and ACPI CPPC support on hardware pub itmt: bool, + #[cfg(windows)] + #[argh(option, long = "kernel-log-file", arg_name = "PATH")] + /// forward hypervisor kernel driver logs for this VM to a file. 
+ pub kernel_log_file: Option, #[cfg(unix)] #[argh(option, long = "kvm-device", arg_name = "PATH")] /// path to the KVM device. (default /dev/kvm) pub kvm_device_path: Option, @@ -698,6 +721,14 @@ pub struct RunCommand { #[argh(switch)] /// disable host swap on guest VM pages. pub lock_guest_memory: bool, + #[cfg(windows)] + #[argh(option, long = "log-file", arg_name = "PATH")] + /// redirect logs to the supplied log file at PATH rather than stderr. For multi-process mode, use --logs-directory instead + pub log_file: Option, + #[cfg(windows)] + #[argh(option, long = "logs-directory", arg_name = "PATH")] + /// path to the logs directory used for crosvm processes. Logs will be sent to stderr if unset, and stderr/stdout will be uncaptured + pub logs_directory: Option, #[cfg(unix)] #[argh(option, arg_name = "MAC", long = "mac")] /// MAC address for VM pub mac_address: Option, @@ -802,6 +833,26 @@ pub struct RunCommand { #[argh(switch)] /// grant this Guest VM certain privileges to manage Host resources, such as power management pub privileged_vm: bool, + #[cfg(feature = "process-invariants")] + #[argh(option, long = "process-invariants-handle", arg_name = "PATH")] + /// shared read-only memory address for a serialized EmulatorProcessInvariants proto + pub process_invariants_data_handle: Option, + #[cfg(feature = "process-invariants")] + #[argh(option, long = "process-invariants-size", arg_name = "PATH")] + /// size of the serialized EmulatorProcessInvariants proto pointed at by process-invariants-handle + pub process_invariants_data_size: Option, + #[cfg(windows)] + #[argh(option, long = "product-channel")] + /// product channel + pub product_channel: Option, + #[cfg(feature = "crash-report")] + #[argh(option, long = "product-name")] + /// the product name for file paths. + pub product_name: Option, + #[cfg(windows)] + #[argh(option, long = "product-version")] + /// product version + pub product_version: Option, #[argh(switch)] /// prevent host access to guest memory pub protected_vm: bool, @@ -812,6 +863,10 @@ pub struct RunCommand { /// path to pstore buffer backend file followed by size /// [--pstore ] pub pstore: Option, + #[cfg(windows)] + #[argh(switch)] + /// enable virtio-pvclock. + pub pvclock: bool, // Must be `Some` iff `protected_vm == ProtectionType::UnprotectedWithFirmware`. #[argh(option, long = "unprotected-vm-with-firmware", arg_name = "PATH")] /// (EXPERIMENTAL/FOR DEBUGGING) Use VM firmware, but allow host access to guest memory pub pvm_fw: Option, @@ -913,6 +968,10 @@ pub struct RunCommand { /// Can only be given once. Will default to first serial /// port if not provided. pub serial_parameters: Vec, + #[cfg(feature = "kiwi")] + #[argh(option, long = "service-pipe-name", arg_name = "PIPE_NAME")] + /// the service ipc pipe name. (Prefix \\\\.\\pipe\\ not needed.) + pub service_pipe_name: Option, #[cfg(unix)] #[argh( option, @@ -966,6 +1025,10 @@ pub struct RunCommand { /// when the underlying file system supports POSIX ACLs. /// The default value for this option is "true". pub shared_dirs: Vec, + #[cfg(feature = "slirp-ring-capture")] + #[argh(option, long = "slirp-capture-file", arg_name = "PATH")] + /// Redirects slirp network packets to the supplied log file rather than the current directory as `slirp_capture_packets.pcap` + pub slirp_capture_file: Option, #[argh(option, short = 's', long = "socket", arg_name = "PATH")] /// path to put the control socket.
If PATH is a directory, a name will be generated pub socket_path: Option, @@ -1391,6 +1454,36 @@ impl TryFrom for super::config::Config { cfg.pmem_devices.push(pmem); } + #[cfg(windows)] + { + #[cfg(feature = "crash-report")] + { + cfg.product_name = cmd.product_name; + + cfg.crash_pipe_name = cmd.crash_pipe_name; + } + cfg.exit_stats = cmd.exit_stats; + cfg.host_guid = cmd.host_guid; + cfg.irq_chip = cmd.irq_chip; + cfg.kernel_log_file = cmd.kernel_log_file; + cfg.log_file = cmd.log_file; + cfg.logs_directory = cmd.logs_directory; + #[cfg(feature = "process-invariants")] + { + cfg.process_invariants_data_handle = cmd.process_invariants_data_handle; + + cfg.process_invariants_data_size = cmd.process_invariants_data_size; + } + cfg.pvclock = cmd.pvclock; + cfg.service_pipe_name = cmd.service_pipe_name; + #[cfg(feature = "slirp-ring-capture")] + { + cfg.slirp_capture_file = cmd.slirp_capture_file; + } + cfg.syslog_tag = cmd.syslog_tag; + cfg.product_channel = cmd.product_channel; + cfg.product_version = cmd.product_version; + } cfg.pstore = cmd.pstore; #[cfg(unix)] diff --git a/src/crosvm/config.rs b/src/crosvm/config.rs index cce2cfc8cb..40d7c8e863 100644 --- a/src/crosvm/config.rs +++ b/src/crosvm/config.rs @@ -35,7 +35,7 @@ use x86_64::{set_enable_pnp_data_msr_config, set_itmt_msr_config}; #[cfg(feature = "audio")] use devices::{Ac97Backend, Ac97Parameters}; -use super::{argument::parse_hex_or_decimal, check_opt_path}; +use super::{argument::parse_hex_or_decimal, check_opt_path, sys::HypervisorKind}; cfg_if::cfg_if! { if #[cfg(unix)] { @@ -49,6 +49,10 @@ cfg_if::cfg_if! { static KVM_PATH: &str = "/dev/kvm"; static VHOST_NET_PATH: &str = "/dev/vhost-net"; static SECCOMP_POLICY_DIR: &str = "/usr/share/policy/crosvm"; + } else if #[cfg(windows)] { + use base::{Event, Tube}; + + use crate::crosvm::sys::windows::config::IrqChipKind; } } @@ -1236,11 +1240,21 @@ pub struct Config { pub balloon_bias: i64, pub balloon_control: Option, pub battery_type: Option, + #[cfg(windows)] + pub block_control_tube: Vec, + #[cfg(windows)] + pub block_vhost_user_tube: Vec, + #[cfg(windows)] + pub broker_shutdown_event: Option, pub cid: Option, #[cfg(unix)] pub coiommu_param: Option, pub cpu_capacity: BTreeMap, // CPU index -> capacity pub cpu_clusters: Vec>, + #[cfg(feature = "crash-report")] + pub crash_pipe_name: Option, + #[cfg(feature = "crash-report")] + pub crash_report_uuid: Option, pub delay_rt: bool, #[cfg(feature = "direct")] pub direct_edge_irq: Vec, @@ -1259,6 +1273,8 @@ pub struct Config { pub dmi_path: Option, pub enable_pnp_data: bool, pub executable_path: Option, + #[cfg(windows)] + pub exit_stats: bool, pub file_backed_mappings: Vec, pub force_calibrated_tsc_leaf: bool, pub force_s2idle: bool, @@ -1269,20 +1285,33 @@ pub struct Config { #[cfg(all(unix, feature = "gpu"))] pub gpu_render_server_parameters: Option, pub host_cpu_topology: bool, + #[cfg(windows)] + pub host_guid: Option, pub host_ip: Option, pub hugepages: bool, + pub hypervisor: Option, pub init_memory: Option, pub initrd_path: Option, + #[cfg(windows)] + pub irq_chip: Option, pub itmt: bool, pub jail_config: Option, + #[cfg(windows)] + pub kernel_log_file: Option, #[cfg(unix)] pub kvm_device_path: PathBuf, #[cfg(unix)] pub lock_guest_memory: bool, + #[cfg(windows)] + pub log_file: Option, + #[cfg(windows)] + pub logs_directory: Option, pub mac_address: Option, pub memory: Option, pub memory_file: Option, pub mmio_address_ranges: Vec, + #[cfg(windows)] + pub net_vhost_user_tube: Option, pub net_vq_pairs: Option, pub netmask: 
Option, pub no_i8042: bool, @@ -1302,17 +1331,33 @@ pub struct Config { pub plugin_root: Option, pub pmem_devices: Vec, pub privileged_vm: bool, + #[cfg(feature = "process-invariants")] + pub process_invariants_data_handle: Option, + #[cfg(feature = "process-invariants")] + pub process_invariants_data_size: Option, + #[cfg(feature = "crash-report")] + pub product_channel: Option, + #[cfg(windows)] + pub product_name: Option, + #[cfg(windows)] + pub product_version: Option, pub protected_vm: ProtectionType, pub pstore: Option, + #[cfg(windows)] + pub pvclock: bool, /// Must be `Some` iff `protected_vm == ProtectionType::UnprotectedWithFirmware`. pub pvm_fw: Option, pub rng: bool, pub rt_cpus: Vec, #[serde(with = "serde_serial_params")] pub serial_parameters: BTreeMap<(SerialHardware, u8), SerialParameters>, + #[cfg(feature = "kiwi")] + pub service_pipe_name: Option, #[cfg(unix)] #[serde(skip)] pub shared_dirs: Vec, + #[cfg(feature = "slirp-ring-capture")] + pub slirp_capture_file: Option, pub socket_path: Option, #[cfg(feature = "tpm")] pub software_tpm: bool, @@ -1322,6 +1367,8 @@ pub struct Config { pub strict_balloon: bool, pub stub_pci_devices: Vec, pub swiotlb: Option, + #[cfg(windows)] + pub syslog_tag: Option, #[cfg(unix)] pub tap_fd: Vec, pub tap_name: Vec, @@ -1381,9 +1428,19 @@ impl Default for Config { balloon_bias: 0, balloon_control: None, battery_type: None, + #[cfg(windows)] + block_control_tube: Vec::new(), + #[cfg(windows)] + block_vhost_user_tube: Vec::new(), + #[cfg(windows)] + broker_shutdown_event: None, cid: None, #[cfg(unix)] coiommu_param: None, + #[cfg(feature = "crash-report")] + crash_pipe_name: None, + #[cfg(feature = "crash-report")] + crash_report_uuid: None, cpu_capacity: BTreeMap::new(), cpu_clusters: Vec::new(), delay_rt: false, @@ -1404,6 +1461,8 @@ impl Default for Config { dmi_path: None, enable_pnp_data: false, executable_path: None, + #[cfg(windows)] + exit_stats: false, file_backed_mappings: Vec::new(), force_calibrated_tsc_leaf: false, force_s2idle: false, @@ -1414,24 +1473,41 @@ impl Default for Config { #[cfg(all(unix, feature = "gpu"))] gpu_render_server_parameters: None, host_cpu_topology: false, + #[cfg(windows)] + host_guid: None, host_ip: None, + #[cfg(windows)] + product_version: None, + #[cfg(windows)] + product_channel: None, hugepages: false, + hypervisor: None, init_memory: None, initrd_path: None, + #[cfg(windows)] + irq_chip: None, itmt: false, jail_config: if !cfg!(feature = "default-no-sandbox") { Some(Default::default()) } else { None }, + #[cfg(windows)] + kernel_log_file: None, #[cfg(unix)] kvm_device_path: PathBuf::from(KVM_PATH), #[cfg(unix)] lock_guest_memory: false, + #[cfg(windows)] + log_file: None, + #[cfg(windows)] + logs_directory: None, mac_address: None, memory: None, memory_file: None, mmio_address_ranges: Vec::new(), + #[cfg(windows)] + net_vhost_user_tube: None, net_vq_pairs: None, netmask: None, no_i8042: false, @@ -1451,14 +1527,26 @@ impl Default for Config { plugin_root: None, pmem_devices: Vec::new(), privileged_vm: false, + #[cfg(feature = "process-invariants")] + process_invariants_data_handle: None, + #[cfg(feature = "process-invariants")] + process_invariants_data_size: None, + #[cfg(feature = "crash-report")] + product_name: None, protected_vm: ProtectionType::Unprotected, pstore: None, + #[cfg(windows)] + pvclock: false, pvm_fw: None, rng: true, rt_cpus: Vec::new(), serial_parameters: BTreeMap::new(), + #[cfg(feature = "kiwi")] + service_pipe_name: None, #[cfg(unix)] shared_dirs: Vec::new(), + #[cfg(feature 
= "slirp-ring-capture")] + slirp_capture_file: None, socket_path: None, #[cfg(feature = "tpm")] software_tpm: false, @@ -1468,6 +1556,8 @@ impl Default for Config { strict_balloon: false, stub_pci_devices: Vec::new(), swiotlb: None, + #[cfg(windows)] + syslog_tag: None, #[cfg(unix)] tap_fd: Vec::new(), tap_name: Vec::new(), diff --git a/src/crosvm/sys.rs b/src/crosvm/sys.rs index 1a855388da..2ade3d9c99 100644 --- a/src/crosvm/sys.rs +++ b/src/crosvm/sys.rs @@ -2,11 +2,18 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +#[cfg(unix)] +pub(crate) mod unix; + +#[cfg(windows)] +pub(crate) mod windows; + cfg_if::cfg_if! { if #[cfg(unix)] { - pub(crate) mod unix; use unix as platform; pub(crate) use unix::*; + } else if #[cfg(windows)] { + use windows as platform; } else { compile_error!("Unsupported platform"); } @@ -15,5 +22,8 @@ cfg_if::cfg_if! { pub(crate) use platform::cmdline; pub(crate) use platform::config; +#[cfg(feature = "crash-report")] +pub(crate) use platform::broker::setup_emulator_crash_reporting; #[cfg(feature = "gpu")] pub(crate) use platform::config::validate_gpu_config; +pub(crate) use platform::config::HypervisorKind; diff --git a/src/crosvm/sys/unix/config.rs b/src/crosvm/sys/unix/config.rs index b5952da394..19e8a3ef71 100644 --- a/src/crosvm/sys/unix/config.rs +++ b/src/crosvm/sys/unix/config.rs @@ -16,13 +16,27 @@ use crate::crosvm::config::{invalid_value_err, Config}; #[cfg(feature = "gpu")] use crate::crosvm::{argument, argument::parse_hex_or_decimal}; +#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] +pub enum HypervisorKind { + Kvm, +} + +impl FromStr for HypervisorKind { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "kvm" => Ok(HypervisorKind::Kvm), + _ => Err("invalid hypervisor backend"), + } + } +} + #[cfg(all(feature = "gpu", feature = "virgl_renderer_next"))] pub fn parse_gpu_render_server_options( s: &str, ) -> Result { - use std::{path::PathBuf, str::FromStr}; - - use crate::crosvm::{config::invalid_value_err, sys::GpuRenderServerParameters}; + use crate::crosvm::sys::GpuRenderServerParameters; let mut path: Option = None; let mut cache_path = None; @@ -65,8 +79,6 @@ pub fn parse_ac97_options( key: &str, #[allow(unused_variables)] value: &str, ) -> Result<(), String> { - use std::{path::PathBuf, str::FromStr}; - match key { #[cfg(feature = "audio_cras")] "client_type" => { @@ -260,8 +272,6 @@ pub fn parse_gpu_options(s: &str) -> Result { use devices::virtio::GpuMode; use rutabaga_gfx::RutabagaWsi; - use crate::crosvm::sys::config::is_gpu_backend_deprecated; - #[cfg(feature = "gfxstream")] let mut vulkan_specified = false; #[cfg(feature = "gfxstream")] diff --git a/src/crosvm/sys/windows.rs b/src/crosvm/sys/windows.rs new file mode 100644 index 0000000000..cabc012745 --- /dev/null +++ b/src/crosvm/sys/windows.rs @@ -0,0 +1,10 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +pub mod cmdline; +pub mod config; + +pub(crate) mod broker; +pub(crate) mod exit; +pub(crate) mod stats; diff --git a/src/crosvm/sys/windows/broker.rs b/src/crosvm/sys/windows/broker.rs new file mode 100644 index 0000000000..b8038f37c1 --- /dev/null +++ b/src/crosvm/sys/windows/broker.rs @@ -0,0 +1,1747 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! 
Contains the multi-process broker for crosvm. This is a work in progress, and some example +//! structs here are dead code. +#![allow(dead_code)] +use crate::crosvm::sys::windows::exit::{ + to_process_type_error, Exit, ExitCode, ExitCodeWrapper, ExitContext, ExitContextAnyhow, +}; +#[cfg(feature = "crash-report")] +use crash_report::CrashReportAttributes; + +use crate::{bail_exit_code, crosvm::sys::config::ProcessType, ensure_exit_code, Config}; +use anyhow::{anyhow, Context, Result}; +use base::named_pipes::{self, BlockingMode, FramingMode}; +use base::{ + error, info, syslog, warn, AsRawDescriptor, Descriptor, DuplicateHandleRequest, + DuplicateHandleResponse, Event, EventToken, RawDescriptor, ReadNotifier, SafeDescriptor, Timer, + Tube, WaitContext, +}; + +use base::enable_high_res_timers; +use broker_ipc::CommonChildStartupArgs; +#[cfg(feature = "process-invariants")] +use broker_ipc::{init_broker_process_invariants, EmulatorProcessInvariants}; +use devices::virtio::vhost::user::device::NetBackendConfig; +#[cfg(feature = "gpu")] +use gpu_display::EventDevice; +use metrics::event_details_proto::{EmulatorChildProcessExitDetails, RecordDetails}; +use metrics::{self, MetricEventType}; +use net_util::slirp::sys::windows::{SlirpStartupConfig, SLIRP_BUFFER_SIZE}; +use std::boxed::Box; +use std::collections::HashMap; +use std::env::current_exe; +use std::ffi::OsStr; +use std::fmt::{self, Debug, Display, Formatter}; +use std::fs::OpenOptions; +use std::os::windows::io::{AsRawHandle, RawHandle}; +use std::path::{Path, PathBuf}; +use std::process::{self, Command}; +use std::time::Duration; +use tube_transporter::{TubeToken, TubeTransferData, TubeTransporter}; +use win_util::get_exit_code_process; +use winapi::shared::winerror::ERROR_ACCESS_DENIED; +use winapi::um::processthreadsapi::TerminateProcess; +#[cfg(feature = "crash-report")] +use {base::generate_uuid, crash_report::product_type}; + +const KILL_CHILD_EXIT_CODE: u32 = 1; + +/// With the GPU case, only the backend needs the event devices (input device source end), so +/// we have two structs. This one is sent to the backend, and the other goes to the main process. +#[cfg(feature = "gpu")] +struct GpuDeviceBackend { + bootstrap_tube: Tube, + vhost_user: Tube, + event_devices: Vec, +} + +/// Main process end for a GPU device. +#[cfg(feature = "gpu")] +struct GpuDeviceVMM { + bootstrap_tube: Tube, + vhost_user: Tube, +} + +/// Example of the function that would be in linux.rs. +#[cfg(feature = "gpu")] +fn platform_create_gpus(_cfg: Config) -> Vec<(GpuDeviceBackend, GpuDeviceVMM)> { + unimplemented!() +} + +/// This struct represents a configured "disk" device as returned by the platform's API. There will +/// be two instances of it for each disk device, with the Tubes connected appropriately. The broker +/// will send one of these to the main process, and the other to the vhost user disk backend. +struct DiskDeviceEnd { + bootstrap_tube: Tube, + vhost_user: Tube, +} + +/// Example of the function that would be in linux.rs. +fn platform_create_disks(_cfg: Config) -> Vec<(DiskDeviceEnd, DiskDeviceEnd)> { + unimplemented!() +} + +/// Time to wait after a process failure for the remaining processes to exit. When exceeded, all +/// remaining processes, except metrics, will be terminated. +const EXIT_TIMEOUT: Duration = Duration::from_secs(3); +/// Time to wait for the metrics process to flush and upload all logs. +const METRICS_TIMEOUT: Duration = Duration::from_secs(3); + +/// Maps a process type to its sandbox policy configuration. 
+fn process_policy(process_type: ProcessType, cfg: &Config) -> sandbox::policy::Policy { + #[allow(unused_mut)] + let mut policy = match process_type { + ProcessType::Block => sandbox::policy::BLOCK, + ProcessType::Main => main_process_policy(cfg), + ProcessType::Metrics => sandbox::policy::METRICS, + ProcessType::Net => sandbox::policy::NET, + ProcessType::Slirp => sandbox::policy::SLIRP, + }; + #[cfg(feature = "asan")] + adjust_asan_policy(&mut policy); + #[cfg(feature = "cperfetto")] + adjust_perfetto_policy(&mut policy); + policy +} + +/// Dynamically appends rules to the main process's policy. +fn main_process_policy(cfg: &Config) -> sandbox::policy::Policy { + let mut policy = sandbox::policy::MAIN; + if let Some(host_guid) = &cfg.host_guid { + let rule = sandbox::policy::Rule { + subsystem: sandbox::SubSystem::SUBSYS_FILES, + semantics: sandbox::Semantics::FILES_ALLOW_ANY, + pattern: format!("\\??\\pipe\\{}\\vsock-*", host_guid), + }; + policy.exceptions.push(rule); + } + let blocked_dlls = vec![ + "NahimicOSD.dll", + "XSplitGameSource64.dll", + "TwitchNativeOverlay64.dll", + "GridWndHook.dll", + ]; + for dll in blocked_dlls.iter() { + policy.dll_blocklist.push(dll.to_string()); + } + policy +} + +/// Adjust a policy to allow ASAN builds to write output files. +fn adjust_asan_policy(policy: &mut sandbox::policy::Policy) { + if (policy.initial_token_level as i32) < (sandbox::TokenLevel::USER_RESTRICTED_NON_ADMIN as i32) + { + policy.initial_token_level = sandbox::TokenLevel::USER_RESTRICTED_NON_ADMIN; + } + if (policy.integrity_level as i32) > (sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM as i32) { + policy.integrity_level = sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM; + } +} + +/// Adjust a policy to allow perfetto tracing to open shared memory and use WASAPI. +fn adjust_perfetto_policy(policy: &mut sandbox::policy::Policy) { + if (policy.initial_token_level as i32) + < (sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS as i32) + { + policy.initial_token_level = sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS; + } + + if (policy.lockdown_token_level as i32) + < (sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS as i32) + { + policy.lockdown_token_level = sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS; + } + + if (policy.integrity_level as i32) > (sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM as i32) { + policy.integrity_level = sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM; + } + + if (policy.delayed_integrity_level as i32) + > (sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM as i32) + { + policy.delayed_integrity_level = sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM; + } +} + +/// Wrapper that terminates a child process (if running) when dropped. 
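+/// Note: the broker tracks these in a HashMap keyed by alias PID; on an error or
+/// scope exit in `run_internal`, dropping that map is what force-kills any
+/// children that are still running.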
+struct ChildCleanup { + process_type: ProcessType, + child: Box, + dh_tube: Option, +} + +#[derive(Debug)] +struct UnsandboxedChild(process::Child); +#[derive(Debug)] +struct SandboxedChild(SafeDescriptor); + +impl AsRawDescriptor for UnsandboxedChild { + fn as_raw_descriptor(&self) -> RawDescriptor { + self.0.as_raw_handle() + } +} + +impl AsRawDescriptor for SandboxedChild { + fn as_raw_descriptor(&self) -> RawDescriptor { + self.0.as_raw_descriptor() + } +} + +impl Display for ChildCleanup { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{:?} {:?}", self.process_type, self.child) + } +} + +trait Child: std::fmt::Debug + AsRawDescriptor { + fn wait(&mut self) -> std::io::Result>; + fn try_wait(&mut self) -> std::io::Result>; + fn kill(&mut self) -> std::io::Result<()>; + // Necessary to upcast dyn Child to dyn AsRawDescriptor + fn as_descriptor(&self) -> &dyn AsRawDescriptor; +} + +impl Child for UnsandboxedChild { + fn wait(&mut self) -> std::io::Result> { + Ok(self.0.wait()?.code()) + } + + fn try_wait(&mut self) -> std::io::Result> { + if let Some(status) = self.0.try_wait()? { + Ok(status.code()) + } else { + Ok(None) + } + } + + fn kill(&mut self) -> std::io::Result<()> { + self.0.kill() + } + + fn as_descriptor(&self) -> &dyn AsRawDescriptor { + self + } +} + +impl Child for SandboxedChild { + fn wait(&mut self) -> std::io::Result> { + let wait_ctx = WaitContext::::new()?; + wait_ctx.add(&self.0, 0)?; + let _events = wait_ctx.wait()?; + self.try_wait() + } + + fn try_wait(&mut self) -> std::io::Result> { + get_exit_code_process(self.0.as_raw_descriptor()).map(|code| code.map(|c| c as i32)) + } + + fn kill(&mut self) -> std::io::Result<()> { + if unsafe { TerminateProcess(self.0.as_raw_descriptor(), KILL_CHILD_EXIT_CODE) == 0 } { + Err(std::io::Error::last_os_error()) + } else { + Ok(()) + } + } + + fn as_descriptor(&self) -> &dyn AsRawDescriptor { + self + } +} + +impl Drop for ChildCleanup { + fn drop(&mut self) { + let kill_process = match self.child.try_wait() { + Ok(None) => true, + Ok(_) => false, + Err(_) => true, + }; + if kill_process { + if let Err(e) = self.child.kill() { + const ACCESS_DENIED: Option = Some(ERROR_ACCESS_DENIED as i32); + if !matches!(e.raw_os_error(), ACCESS_DENIED) { + error!("Failed to clean up child process {}: {}", self, e); + } + } + + // Sending a kill signal does NOT imply the process has exited. Wait for it to exit. + let wait_res = self.child.wait(); + if let Ok(Some(code)) = wait_res.as_ref() { + warn!( + "child process {} killed, exited {}", + self, + ExitCodeWrapper(*code) + ); + } else { + error!( + "failed to wait for child process {} that was terminated: {:?}", + self, wait_res + ); + } + } else { + info!("child process {} already terminated", self); + } + + // Log child exit code regardless of whether we killed it or it exited + // on its own. + { + // Don't even attempt to log metrics process, it doesn't exist to log + // itself. 
+ if self.process_type != ProcessType::Metrics { + let exit_code = self.child.wait(); + if exit_code.is_ok() && exit_code.as_ref().unwrap().is_some() { + let mut details = RecordDetails::new(); + let mut exit_details = EmulatorChildProcessExitDetails::new(); + exit_details.set_exit_code(exit_code.unwrap().unwrap() as u32); + exit_details.set_process_type(self.process_type.into()); + details.set_emulator_child_process_exit_details(exit_details); + metrics::log_event_with_details(MetricEventType::ChildProcessExit, &details); + } else { + error!( + "Failed to log exit code for process: {:?}, couldn't get exit code", + self.process_type + ); + } + } + } + } +} + +/// Represents a child process spawned by the broker. +struct ChildProcess { + // This is unused, but we hold it open to avoid an EPIPE in the child if it doesn't + // immediately read its startup information. We don't use FlushFileBuffers to avoid this because + // that would require blocking the startup sequence. + tube_transporter: TubeTransporter, + + // Used to set up the child process. Unused in steady state. + bootstrap_tube: Tube, + // Child process PID. + process_id: u32, + alias_pid: u32, +} + +/// Wrapper to start the broker. +pub fn run(cfg: Config) -> Result<()> { + // This wrapper exists because errors that are returned up to the caller aren't logged, though + // they are used to generate the return code. For practical debugging though, we want to log the + // errors. + let res = run_internal(cfg); + if let Err(e) = &res { + error!("Broker encountered an error: {}", e); + } + res +} + +#[derive(EventToken)] +enum Token { + Sigterm, + Process(u32), + MainExitTimeout, + DeviceExitTimeout, + MetricsExitTimeout, + SigtermTimeout, + DuplicateHandle(u32), +} + +fn get_log_path(cfg: &Config, file_name: &str) -> Option { + match cfg.logs_directory.as_ref() { + Some(dir) => Some(Path::new(dir).join(file_name)), + None => None, + } +} + +/// Creates a metrics tube pair for communication with the metrics process. +/// The returned Tube will be used by the process producing logs, while +/// the metric_tubes list is sent to the metrics process to receive logs. +/// +/// IMPORTANT NOTE: The metrics process must receive the client (second) end +/// of the Tube pair in order to allow the connection to be properly shut +/// down without data loss. +fn metrics_tube_pair(metric_tubes: &mut Vec) -> Result { + // TODO(nkgold): as written, this Tube pair won't handle ancillary data properly because the + // PIDs are not set properly at each end; however, we don't plan to send ancillary data. + let (t1, t2) = Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + metric_tubes.push(t2); + Ok(t1) +} + +#[cfg(feature = "crash-report")] +pub fn create_crash_report_attrs(cfg: &Config, product_type: &str) -> CrashReportAttributes { + crash_report::CrashReportAttributes { + product_type: product_type.to_owned(), + pipe_name: cfg.crash_pipe_name.clone(), + report_uuid: cfg.crash_report_uuid.clone(), + product_name: cfg.product_name.clone(), + product_version: cfg.product_version.clone(), + } +} + +/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid +/// making crash reports incomprehensible. 
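+/// For example, this patch registers the main emulator process with
+/// `product_type::EMULATOR`, the broker itself with `product_type::BROKER` (see
+/// `init_broker_crash_reporting` below), and each disk backend with a
+/// `product_type::DISK`-derived name suffixed by its index.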
+#[cfg(feature = "crash-report")] +pub fn setup_emulator_crash_reporting(cfg: &Config) -> Result { + crash_report::setup_crash_reporting(create_crash_report_attrs( + cfg, + crash_report::product_type::EMULATOR, + )) + .exit_context( + Exit::CrashReportingInit, + "failed to initialize crash reporting", + ) +} + +/// Starts the broker, which in turn spawns the main process & vhost user devices. +/// General data flow for device & main process spawning: +/// Each platform (e.g. linux.rs) will provide create_inputs/gpus/nets. +/// +/// Those functions will return a list of pairs of structs (containing the pipes and other +/// process specific configuration) for the VMM & backend sides of the device. These structs +/// should be minimal, and not duplicate information that is otherwise available in the Config +/// struct. There MAY be two different types per device, one for the VMM side, and another for +/// the backend. +/// +/// The broker will send all the VMM structs to the main process, and the other structs +/// to the vhost user backends. Every process will get a copy of the Config struct. +/// +/// Finally, the broker will wait on the child processes to exit, and handle errors. +/// +/// Refrain from using platform specific code within this function. It will eventually be cross +/// platform. +fn run_internal(mut cfg: Config) -> Result<()> { + if sandbox::is_sandbox_broker() { + // Get the BrokerServices pointer so that it gets initialized. + sandbox::BrokerServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + // Note that parsing args causes syslog's log file to be set to the log file for the "main" + // process. We don't want broker logs going there, so we fetch our own log file and set it here. + let mut log_cfg = syslog::LogConfig::default(); + if let Some(log_path) = get_log_path(&cfg, "broker_syslog.log") { + log_cfg.pipe = Some(Box::new( + OpenOptions::new() + .append(true) + .create(true) + .open(log_path.as_path()) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", log_path.display()) + })?, + )); + log_cfg.stderr = false; + } else { + log_cfg.stderr = true; + } + syslog::init_with(log_cfg)?; + + #[cfg(feature = "process-invariants")] + let process_invariants = init_broker_process_invariants( + &cfg.process_invariants_data_handle, + &cfg.process_invariants_data_size, + ) + .exit_context( + Exit::ProcessInvariantsInit, + "failed to initialize process invariants", + )?; + + #[cfg(feature = "crash-report")] + init_broker_crash_reporting(&mut cfg)?; + + let _raise_timer_resolution = enable_high_res_timers() + .exit_context(Exit::EnableHighResTimer, "failed to enable high res timers")?; + + // Note: in case of an error / scope exit, any children still in this map will be automatically + // closed. + let mut children: HashMap = HashMap::new(); + + let mut exit_events = Vec::new(); + let mut wait_ctx: WaitContext = WaitContext::new() + .exit_context(Exit::CreateWaitContext, "failed to create event context")?; + + // Hook ^C / SIGTERM so we can handle it gracefully. 
+ let sigterm_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?; + let sigterm_event_ctrlc = sigterm_event + .try_clone() + .exit_context(Exit::CloneEvent, "failed to clone event")?; + ctrlc::set_handler(move || { + sigterm_event_ctrlc.write(0).unwrap(); + }) + .exit_context(Exit::SetSigintHandler, "failed to set sigint handler")?; + wait_ctx.add(&sigterm_event, Token::Sigterm).exit_context( + Exit::WaitContextAdd, + "failed to add trigger to event context", + )?; + + let mut metric_tubes = Vec::new(); + let metrics_controller = spawn_child( + current_exe().unwrap().to_str().unwrap(), + &["run-metrics"], + get_log_path(&cfg, "metrics_stdout.log"), + get_log_path(&cfg, "metrics_stderr.log"), + ProcessType::Metrics, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ + #[cfg(test)] + false, + /* use_sandbox= */ + cfg.jail_config.is_some(), + Vec::new(), + &cfg, + )?; + metrics_controller + .tube_transporter + .serialize_and_transport(metrics_controller.process_id) + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + let mut main_child = spawn_child( + current_exe().unwrap().to_str().unwrap(), + &["run-main"], + get_log_path(&cfg, "main_stdout.log"), + get_log_path(&cfg, "main_stderr.log"), + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ + #[cfg(test)] + false, + /* use_sandbox= */ + cfg.jail_config.is_some(), + Vec::new(), + &cfg, + )?; + + // Save block children `ChildProcess` so TubeTransporter and Tubes don't get closed. + let _block_children = start_up_block_backends( + &mut cfg, + &mut children, + &mut exit_events, + &mut wait_ctx, + &mut main_child, + &mut metric_tubes, + #[cfg(feature = "process-invariants")] + &process_invariants, + )?; + + let (_slirp_child, _net_children) = start_up_net_backend( + &mut main_child, + &mut children, + &mut exit_events, + &mut wait_ctx, + &mut cfg, + &mut metric_tubes, + #[cfg(feature = "process-invariants")] + &process_invariants, + )?; + + // Wait until all device processes are spun up so main TubeTransporter will have all the + // device control and Vhost tubes. + main_child + .tube_transporter + .serialize_and_transport(main_child.process_id) + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + main_child.bootstrap_tube.send(&cfg).unwrap(); + + let main_startup_args = CommonChildStartupArgs::new( + get_log_path(&cfg, "main_syslog.log"), + #[cfg(feature = "crash-report")] + create_crash_report_attrs(&cfg, product_type::EMULATOR), + #[cfg(feature = "process-invariants")] + process_invariants.clone(), + Some(metrics_tube_pair(&mut metric_tubes)?), + )?; + main_child.bootstrap_tube.send(&main_startup_args).unwrap(); + + let exit_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?; + main_child.bootstrap_tube.send(&exit_event).unwrap(); + exit_events.push(exit_event); + + // Setup our own metrics agent + { + let broker_metrics = metrics_tube_pair(&mut metric_tubes)?; + metrics::initialize(broker_metrics); + #[cfg(feature = "kiwi")] + { + let use_vulkan = if cfg!(feature = "gpu") { + match &cfg.gpu_parameters { + Some(params) => Some(params.use_vulkan), + None => { + warn!("No GPU parameters set on CrosVM config."); + None + } + } + } else { + None + }; + anti_tamper::setup_common_metric_invariants( + &&cfg.product_version, + &cfg.product_channel, + &use_vulkan, + ); + } + } + + // We have all the metrics tubes from other children, so give them to the metrics controller + // along with a startup configuration. 
+ let metrics_startup_args = CommonChildStartupArgs::new( + get_log_path(&cfg, "metrics_syslog.log"), + #[cfg(feature = "crash-report")] + create_crash_report_attrs(&cfg, product_type::METRICS), + #[cfg(feature = "process-invariants")] + process_invariants.clone(), + None, + )?; + metrics_controller + .bootstrap_tube + .send(&metrics_startup_args) + .unwrap(); + + metrics_controller + .bootstrap_tube + .send(&metric_tubes) + .unwrap(); + + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events) +} + +/// Shuts down the metrics process, waiting for it to close to ensure +/// all logs are flushed. +fn clean_up_metrics(metrics_child: ChildCleanup) -> Result<()> { + // This will close the final metrics connection, triggering a metrics + // process shutdown. + metrics::get_destructor().cleanup(); + + // However, we still want to wait for the metrics process to finish + // flushing any pending logs before exiting. + let metrics_cleanup_wait = WaitContext::::new().exit_context( + Exit::CreateWaitContext, + "failed to create metrics wait context", + )?; + let mut metrics_timeout = + Timer::new().exit_context(Exit::CreateTimer, "failed to create metrics timeout timer")?; + metrics_timeout + .reset(EXIT_TIMEOUT, None) + .exit_context(Exit::ResetTimer, "failed to reset timer")?; + metrics_cleanup_wait.add(&metrics_timeout, 0).exit_context( + Exit::WaitContextAdd, + "failed to add metrics timeout to wait context", + )?; + metrics_cleanup_wait + .add(metrics_child.child.as_descriptor(), 1) + .exit_context( + Exit::WaitContextAdd, + "failed to add metrics process to wait context", + )?; + let events = metrics_cleanup_wait + .wait() + .context("failed to wait for metrics context")?; + + let mut process_exited = false; + if events + .iter() + .find(|e| e.is_readable && e.token == 1) + .is_some() + { + process_exited = true; + } + + if !process_exited { + warn!( + "broker: Metrics process timed out before cleanly exiting. + This may indicate some logs remain unsent." + ); + // Process will be force-killed on drop + } + + Ok(()) +} + +#[cfg(feature = "crash-report")] +fn init_broker_crash_reporting(cfg: &mut Config) -> Result<()> { + cfg.crash_report_uuid = Some(generate_uuid()); + if cfg.crash_pipe_name.is_none() { + // We weren't started by the service. Spin up a crash reporter to be shared with all + // children. + cfg.crash_pipe_name = Some( + crash_report::setup_crash_reporting(create_crash_report_attrs( + &cfg, + product_type::BROKER, + )) + .exit_context(Exit::CrashReportingInit, "failed to init crash reporting")?, + ); + } else { + crash_report::setup_crash_reporting(create_crash_report_attrs(&cfg, product_type::BROKER)) + .exit_context(Exit::CrashReportingInit, "failed to init crash reporting")?; + } + + Ok(()) +} + +struct Supervisor { + children: HashMap, + wait_ctx: WaitContext, + exit_events: Vec, + exit_timer: Option, +} + +impl Supervisor { + pub fn broker_supervise_loop( + children: HashMap, + wait_ctx: WaitContext, + exit_events: Vec, + ) -> Result<()> { + let mut supervisor = Supervisor { + children, + wait_ctx, + exit_events, + exit_timer: None, + }; + let result = supervisor.broker_loop(); + + // Once the supervise loop exits, we are exiting and just need to clean + // up. In error cases, there could still be child processes, so we close + // those first, and finally drop the metrics process. + supervisor.children.retain(|_, child| { + match child.process_type { + ProcessType::Metrics => true, + _ => { + warn!( + "broker: Forcibly closing child (type: {:?}).
This often means + the child was unable to close within the normal timeout window, + or the broker itself failed with an error.", + child.process_type + ); + // Child killed on drop + false + } + } + }); + + { + if supervisor.is_only_metrics_process_running() { + clean_up_metrics(supervisor.children.into_values().next().unwrap())?; + } else { + warn!( + "broker: Metrics process not running after cleanup. + This may indicate some exit logs have been dropped." + ); + } + } + + result + } + + /// We require exactly one main process. + fn assert_children_sane(&mut self) { + let main_processes = self + .children + .iter() + .filter(|(_, child)| child.process_type == ProcessType::Main) + .count(); + if main_processes != 1 { + // Why do we have to clear children? Well, panic *can* cause destructors not to run, + // which means these children won't be cleaned up. The exact explanation for this isn't clear, but + // it reproduced consistently. So since we're panicking, we'll be careful. + self.children.clear(); + panic!( + "Broker must supervise exactly one main process. Got {} main process(es).", + main_processes, + ) + } + } + + fn is_only_metrics_process_running(&self) -> bool { + self.children.len() == 1 + && self.children.values().next().unwrap().process_type == ProcessType::Metrics + } + + fn all_non_metrics_processes_exited(&self) -> bool { + #[cfg(not(feature = "kiwi"))] + return self.children.len() == 0; + #[cfg(feature = "kiwi")] + return self.children.len() == 0 || self.is_only_metrics_process_running(); + } + + fn start_exit_timer(&mut self, timeout_token: Token) -> Result<()> { + if self.exit_timer.is_some() { + return Ok(()); + } + + let mut et = Timer::new().exit_context(Exit::CreateTimer, "failed to create timer")?; + et.reset(EXIT_TIMEOUT, None) + .exit_context(Exit::ResetTimer, "failed to reset timer")?; + self.wait_ctx.add(&et, timeout_token).exit_context( + Exit::WaitContextAdd, + "failed to add trigger to wait context", + )?; + self.exit_timer = Some(et); + + Ok(()) + } + + /// Once children have been spawned, this function is called to run the supervision loop, which + /// waits for processes to exit and handles errors. + fn broker_loop(&mut self) -> Result<()> { + const KILLED_BY_SIGNAL: ExitCode = Exit::KilledBySignal as ExitCode; + self.assert_children_sane(); + let mut first_nonzero_exitcode = None; + + while !self.all_non_metrics_processes_exited() { + let events = self + .wait_ctx + .wait() + .context("failed to wait for event context")?; + + for event in events.iter().filter(|e| e.is_readable) { + match event.token { + Token::Sigterm => { + // Signal all children other than metrics to exit.
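+ // Each Event in `exit_events` was created at spawn time and sent to
+ // its child over the bootstrap tube, so writing it requests a graceful
+ // shutdown before the sigterm timeout below forces termination.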
+ for exit_event in &self.exit_events { + if let Err(e) = exit_event.write(1) { + error!("failed to signal exit event to child: {}", e); + } + } + first_nonzero_exitcode.get_or_insert(KILLED_BY_SIGNAL); + self.start_exit_timer(Token::SigtermTimeout)?; + } + Token::Process(child_id) => { + let mut child = self.children.remove(&child_id).unwrap(); + let process_handle = Descriptor(child.child.as_raw_descriptor()); + self.wait_ctx.delete(&process_handle).exit_context( + Exit::WaitContextDelete, + "failed to remove trigger from event context", + )?; + if let Some(dh_tube) = child.dh_tube.as_ref() { + self.wait_ctx + .delete(dh_tube.get_read_notifier()) + .exit_context( + Exit::WaitContextDelete, + "failed to remove trigger from event context", + )?; + } + + let exit_code = child.child.wait().unwrap().unwrap(); + info!( + "broker: child (type {:?}) exited {}", + child.process_type, + ExitCodeWrapper(exit_code), + ); + + // Save the child's exit code (to pass through to the broker's exit code) if + // none has been saved or if the previously saved exit code was + // KilledBySignal. We overwrite KilledBySignal because the child exit may + // race with the sigterm from the service, esp if child exit is slowed by a Crashpad + // dump, and we don't want to lose the child's exit code if it was the + // initial cause of the emulator failing. + if exit_code != 0 + && (first_nonzero_exitcode.is_none() + || matches!(first_nonzero_exitcode, Some(KILLED_BY_SIGNAL))) + { + info!( + "setting first_nonzero_exitcode {:?} -> {}", + first_nonzero_exitcode, exit_code, + ); + first_nonzero_exitcode = + Some(to_process_type_error(exit_code as u32, child.process_type) + as i32); + } + + let timeout_token = match child.process_type { + ProcessType::Main => Token::MainExitTimeout, + ProcessType::Metrics => Token::MetricsExitTimeout, + _ => Token::DeviceExitTimeout, + }; + self.start_exit_timer(timeout_token)?; + } + Token::SigtermTimeout => { + if let Some(exit_code) = first_nonzero_exitcode { + if exit_code != KILLED_BY_SIGNAL { + bail_exit_code!( + exit_code, + "broker got sigterm, but a child exited with an error.", + ); + } + } + ensure_exit_code!( + self.all_non_metrics_processes_exited(), + Exit::BrokerSigtermTimeout, + "broker got sigterm, but other broker children did not exit within the \ + timeout", + ); + } + Token::MainExitTimeout => { + if let Some(exit_code) = first_nonzero_exitcode { + bail_exit_code!( + exit_code, + "main exited, but a child exited with an error.", + ); + } + ensure_exit_code!( + self.all_non_metrics_processes_exited(), + Exit::BrokerMainExitedTimeout, + "main exited, but other broker children did not exit within the \ + timeout", + ); + } + Token::DeviceExitTimeout => { + // A device process exited, but there are still other processes running. + if let Some(exit_code) = first_nonzero_exitcode { + bail_exit_code!( + exit_code, + "a device exited, and either it or another child exited with an \ + error.", + ); + } + ensure_exit_code!( + self.all_non_metrics_processes_exited(), + Exit::BrokerDeviceExitedTimeout, + "device exited, but other broker children did not exit within the \ + timeout", + ); + } + Token::MetricsExitTimeout => { + // The metrics server exited, but there are still other processes running. 
+ if let Some(exit_code) = first_nonzero_exitcode { + bail_exit_code!( + exit_code, + "metrics server exited, and either it or another child exited with \ + an error.", + ); + } + ensure_exit_code!( + self.children.len() == 0, + Exit::BrokerMetricsExitedTimeout, + "metrics exited, but other broker children did not exit within the \ + timeout", + ); + } + Token::DuplicateHandle(child_id) => { + if let Some(tube) = &self.children[&child_id].dh_tube { + let req: DuplicateHandleRequest = tube + .recv() + .exit_context(Exit::TubeFailure, "failed operation on tube")?; + if !self.children.contains_key(&req.target_alias_pid) { + error!( + "DuplicateHandleRequest contained invalid alias pid: {}", + req.target_alias_pid + ); + tube.send(&DuplicateHandleResponse { handle: None }) + .exit_context(Exit::TubeFailure, "failed operation on tube")?; + } else { + let target = &self.children[&req.target_alias_pid].child; + let handle = win_util::duplicate_handle_from_source_process( + self.children[&child_id].child.as_raw_descriptor(), + req.handle as RawHandle, + target.as_raw_descriptor(), + ); + match handle { + Ok(handle) => tube + .send(&DuplicateHandleResponse { + handle: Some(handle as usize), + }) + .exit_context( + Exit::TubeFailure, + "failed operation on tube", + )?, + Err(e) => { + error!("Failed to duplicate handle: {}", e); + tube.send(&DuplicateHandleResponse { handle: None }) + .exit_context( + Exit::TubeFailure, + "failed operation on tube", + )? + } + }; + } + } + } + } + } + } + + if let Some(exit_code) = first_nonzero_exitcode { + bail_exit_code!( + exit_code, + if exit_code == KILLED_BY_SIGNAL { + "broker got sigterm, and all children exited zero from shutdown event." + } else { + "all processes exited, but at least one encountered an error." + }, + ); + } + + Ok(()) + } +} + +fn start_up_block_backends( + cfg: &mut Config, + children: &mut HashMap, + exit_events: &mut Vec, + wait_ctx: &mut WaitContext, + main_child: &mut ChildProcess, + metric_tubes: &mut Vec, + #[cfg(feature = "process-invariants")] process_invariants: &EmulatorProcessInvariants, +) -> Result> { + let mut block_children = Vec::new(); + let disk_options = cfg.disks.clone(); + for (index, disk_option) in disk_options.iter().enumerate() { + let block_child = spawn_block_backend(index, main_child, children, wait_ctx, cfg)?; + + let startup_args = CommonChildStartupArgs::new( + get_log_path(cfg, &format!("disk_{}_syslog.log", index)), + #[cfg(feature = "crash-report")] + create_crash_report_attrs(cfg, &format!("{}_{}", product_type::DISK, index)), + #[cfg(feature = "process-invariants")] + process_invariants.clone(), + Some(metrics_tube_pair(metric_tubes)?), + )?; + block_child.bootstrap_tube.send(&startup_args).unwrap(); + + block_child.bootstrap_tube.send(&disk_option).unwrap(); + + let exit_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?; + block_child.bootstrap_tube.send(&exit_event).unwrap(); + exit_events.push(exit_event); + block_children.push(block_child); + } + + Ok(block_children) +} + +fn spawn_block_backend( + log_index: usize, + main_child: &mut ChildProcess, + children: &mut HashMap, + wait_ctx: &mut WaitContext, + cfg: &mut Config, +) -> Result { + let (mut vhost_user_main_tube, mut vhost_user_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + let (mut disk_host_tube, mut disk_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + disk_device_tube.set_target_pid(main_child.alias_pid); + 
vhost_user_device_tube.set_target_pid(main_child.alias_pid); + let block_child = spawn_child( + current_exe().unwrap().to_str().unwrap(), + &["device", "block"], + get_log_path(&cfg, &format!("disk_{}_stdout.log", log_index)), + get_log_path(&cfg, &format!("disk_{}_stderr.log", log_index)), + ProcessType::Block, + children, + wait_ctx, + /* skip_bootstrap= */ + #[cfg(test)] + false, + /* use_sandbox= */ + cfg.jail_config.is_some(), + vec![ + TubeTransferData { + tube: disk_device_tube, + tube_token: TubeToken::Control, + }, + TubeTransferData { + tube: vhost_user_device_tube, + tube_token: TubeToken::VhostUser, + }, + ], + cfg, + )?; + + block_child + .tube_transporter + .serialize_and_transport(block_child.process_id) + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + vhost_user_main_tube.set_target_pid(block_child.alias_pid); + disk_host_tube.set_target_pid(block_child.alias_pid); + cfg.block_control_tube.push(disk_host_tube); + cfg.block_vhost_user_tube.push(vhost_user_main_tube); + + Ok(block_child) +} + +fn spawn_sandboxed_child( + program: &str, + args: I, + stdout_file: Option, + stderr_file: Option, + handles_to_inherit: Vec<&dyn AsRawDescriptor>, + process_policy: sandbox::policy::Policy, +) -> Result<(u32, Box)> +where + I: IntoIterator, + S: AsRef, +{ + let mut broker = sandbox::BrokerServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")? + .unwrap(); + let mut policy = broker.create_policy(); + policy + .set_token_level( + process_policy.initial_token_level, + process_policy.lockdown_token_level, + ) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + policy + .set_job_level(process_policy.job_level, 0) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + policy + .set_integrity_level(process_policy.integrity_level) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + policy + .set_delayed_integrity_level(process_policy.delayed_integrity_level) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + + if process_policy.alternate_desktop { + policy + .set_alternate_desktop(process_policy.alternate_winstation) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + + for rule in process_policy.exceptions { + policy + .add_rule(rule.subsystem, rule.semantics, rule.pattern) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + + policy.set_lockdown_default_dacl(); + + if let Some(file) = stdout_file.as_ref() { + policy + .set_stdout_from_file(file) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + + if let Some(file) = stderr_file.as_ref() { + policy + .set_stderr_from_file(file) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + + for handle in handles_to_inherit.into_iter() { + policy.add_handle_to_share(handle); + } + + for dll in process_policy.dll_blocklist.into_iter() { + policy + .add_dll_to_unload(&dll) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + + // spawn_target uses CreateProcessW to create a new process, which will pass + // the command line arguments verbatim to the new process. Most processes + // expect that argv[0] will be the program name, so provide that before the + // rest of the args. 
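+ // For example (path and handle value illustrative), spawning the block
+ // backend as program `C:\bin\crosvm.exe` with args `["device", "block"]`
+ // plus the appended bootstrap arguments yields the single string:
+ //
+ //     "C:\bin\crosvm.exe" device block --bootstrap 1964
+ //
+ // i.e. the quoted program name followed by each argument verbatim,
+ // separated by spaces.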
+ let command_line = args + .into_iter() + .fold(format!("\"{}\"", program), |mut args, arg| { + args.push(' '); + args.push_str(OsStr::new(&arg).to_str().unwrap()); + args + }); + + let (target, warning) = broker + .spawn_target(program, &command_line, &policy) + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + if let Some(w) = warning { + warn!("sandbox: got warning spawning target: {}", w); + } + win_util::resume_thread(target.thread.as_raw_descriptor()) + .exit_context(Exit::ProcessSpawnFailed, "failed to spawn child process")?; + + Ok((target.process_id, Box::new(SandboxedChild(target.process)))) +} + +fn spawn_unsandboxed_child( + program: &str, + args: I, + stdout_file: Option, + stderr_file: Option, + handles_to_inherit: Vec<&dyn AsRawDescriptor>, +) -> Result<(u32, Box)> +where + I: IntoIterator, + S: AsRef, +{ + let mut proc = Command::new(program); + + let proc = proc.args(args); + + for handle in handles_to_inherit.iter() { + win_util::set_handle_inheritance(handle.as_raw_descriptor(), /* inheritable= */ true) + .exit_context(Exit::CreateSocket, "failed to create socket")?; + } + + if let Some(file) = stdout_file { + proc.stdout(file); + } + + if let Some(file) = stderr_file { + proc.stderr(file); + } + + info!("spawning process: {:?}", proc); + let proc = proc + .spawn() + .exit_context(Exit::ProcessSpawnFailed, "failed to spawn child process")?; + + for handle in handles_to_inherit.iter() { + win_util::set_handle_inheritance(handle.as_raw_descriptor(), /* inheritable= */ false) + .exit_context(Exit::CreateSocket, "failed to create socket")?; + } + + let process_id = proc.id(); + + Ok((process_id, Box::new(UnsandboxedChild(proc)))) +} + +fn start_up_net_backend( + main_child: &mut ChildProcess, + children: &mut HashMap, + exit_events: &mut Vec, + wait_ctx: &mut WaitContext, + cfg: &mut Config, + metric_tubes: &mut Vec, + #[cfg(feature = "process-invariants")] process_invariants: &EmulatorProcessInvariants, +) -> Result<(ChildProcess, ChildProcess)> { + let (host_pipe, guest_pipe) = named_pipes::pair_with_buffer_size( + &FramingMode::Message, + &BlockingMode::Wait, + /* timeout= */ 0, + /* buffer_size= */ SLIRP_BUFFER_SIZE, + /* overlapped= */ true, + ) + .expect("Failed to create named pipe pair."); + let slirp_kill_event = Event::new().expect("Failed to create slirp kill event."); + + let slirp_child = spawn_slirp(children, wait_ctx, cfg)?; + + let slirp_child_startup_args = CommonChildStartupArgs::new( + get_log_path(cfg, "slirp_syslog.log"), + #[cfg(feature = "crash-report")] + create_crash_report_attrs(cfg, product_type::SLIRP), + #[cfg(feature = "process-invariants")] + process_invariants.clone(), + Some(metrics_tube_pair(metric_tubes)?), + )?; + slirp_child + .bootstrap_tube + .send(&slirp_child_startup_args) + .unwrap(); + + let slirp_config = SlirpStartupConfig { + slirp_pipe: host_pipe, + shutdown_event: slirp_kill_event + .try_clone() + .expect("Failed to clone slirp kill event."), + #[cfg(feature = "slirp-ring-capture")] + slirp_capture_file: cfg.slirp_capture_file.take(), + }; + slirp_child.bootstrap_tube.send(&slirp_config).unwrap(); + + let net_child = spawn_net_backend(main_child, children, wait_ctx, cfg)?; + + let net_child_startup_args = CommonChildStartupArgs::new( + get_log_path(cfg, "net_syslog.log"), + #[cfg(feature = "crash-report")] + create_crash_report_attrs(cfg, product_type::SLIRP), + #[cfg(feature = "process-invariants")] + process_invariants.clone(), + Some(metrics_tube_pair(metric_tubes)?), + )?; + net_child + .bootstrap_tube + 
.send(&net_child_startup_args) + .unwrap(); + + let net_backend_config = NetBackendConfig { + guest_pipe, + slirp_kill_event, + }; + net_child.bootstrap_tube.send(&net_backend_config).unwrap(); + let exit_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?; + net_child.bootstrap_tube.send(&exit_event).unwrap(); + exit_events.push(exit_event); + + Ok((slirp_child, net_child)) +} + +fn spawn_slirp( + children: &mut HashMap, + wait_ctx: &mut WaitContext, + cfg: &mut Config, +) -> Result { + let slirp_child = spawn_child( + current_exe().unwrap().to_str().unwrap(), + &["run-slirp"], + get_log_path(&cfg, "slirp_stdout.log"), + get_log_path(&cfg, "slirp_stderr.log"), + ProcessType::Slirp, + children, + wait_ctx, + /* skip_bootstrap= */ + #[cfg(test)] + false, + /* use_sandbox= */ cfg.jail_config.is_some(), + vec![], + cfg, + )?; + + slirp_child + .tube_transporter + .serialize_and_transport(slirp_child.process_id) + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + Ok(slirp_child) +} + +fn spawn_net_backend( + main_child: &mut ChildProcess, + children: &mut HashMap, + wait_ctx: &mut WaitContext, + cfg: &mut Config, +) -> Result { + let (mut vhost_user_main_tube, mut vhost_user_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + vhost_user_device_tube.set_target_pid(main_child.alias_pid); + + let net_child = spawn_child( + current_exe().unwrap().to_str().unwrap(), + &["device", "net"], + get_log_path(&cfg, "net_stdout.log"), + get_log_path(&cfg, "net_stderr.log"), + ProcessType::Net, + children, + wait_ctx, + /* skip_bootstrap= */ + #[cfg(test)] + false, + /* use_sandbox= */ cfg.jail_config.is_some(), + vec![TubeTransferData { + tube: vhost_user_device_tube, + tube_token: TubeToken::VhostUser, + }], + cfg, + )?; + + net_child + .tube_transporter + .serialize_and_transport(net_child.process_id) + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + vhost_user_main_tube.set_target_pid(net_child.alias_pid); + cfg.net_vhost_user_tube = Some(vhost_user_main_tube); + + Ok(net_child) +} + +/// Spawns a child process, sending it a control tube as the --bootstrap=HANDLE_NUMBER argument. +/// stdout & stderr are redirected to the provided file paths. 
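+///
+/// A minimal illustrative call, mirroring the device spawns above (paths and
+/// tube list are placeholders; not compiled here):
+///
+/// ```ignore
+/// let child = spawn_child(
+///     current_exe().unwrap().to_str().unwrap(),
+///     &["device", "block"],
+///     get_log_path(&cfg, "block_stdout.log"),
+///     get_log_path(&cfg, "block_stderr.log"),
+///     ProcessType::Block,
+///     &mut children,
+///     &mut wait_ctx,
+///     #[cfg(test)]
+///     false,
+///     /* use_sandbox= */ cfg.jail_config.is_some(),
+///     vec![],
+///     &cfg,
+/// )?;
+/// child
+///     .tube_transporter
+///     .serialize_and_transport(child.process_id)?;
+/// ```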
+fn spawn_child( + program: &str, + args: I, + stdout_path: Option, + stderr_path: Option, + process_type: ProcessType, + children: &mut HashMap, + wait_ctx: &mut WaitContext, + #[cfg(test)] skip_bootstrap: bool, + use_sandbox: bool, + mut tubes: Vec, + #[allow(unused_variables)] cfg: &Config, +) -> Result +where + I: IntoIterator, + S: AsRef, +{ + let (tube_transport_pipe, tube_transport_main_child) = named_pipes::pair( + &FramingMode::Message, + &BlockingMode::Wait, + /* timeout= */ 0, + ) + .exit_context(Exit::CreateSocket, "failed to create socket")?; + + let stdout_file = if let Some(path) = stdout_path { + Some( + OpenOptions::new() + .append(true) + .create(true) + .open(path.as_path()) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", path.display()) + })?, + ) + } else { + None + }; + + let stderr_file = if let Some(path) = stderr_path { + Some( + OpenOptions::new() + .append(true) + .create(true) + .open(path.as_path()) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", path.display()) + })?, + ) + } else { + None + }; + + #[cfg(test)] + let bootstrap = if !skip_bootstrap { + vec![ + "--bootstrap".to_string(), + (tube_transport_main_child.as_raw_descriptor() as usize).to_string(), + ] + } else { + vec![] + }; + #[cfg(not(test))] + let bootstrap = vec![ + "--bootstrap".to_string(), + (tube_transport_main_child.as_raw_descriptor() as usize).to_string(), + ]; + + let input_args: Vec = args.into_iter().collect(); + let args = input_args + .iter() + .map(|arg| arg.as_ref()) + .chain(bootstrap.iter().map(|arg| arg.as_ref())); + + let (process_id, child) = if use_sandbox { + spawn_sandboxed_child( + program, + args, + stdout_file, + stderr_file, + vec![&tube_transport_main_child], + process_policy(process_type, cfg), + )? + } else { + spawn_unsandboxed_child( + program, + args, + stdout_file, + stderr_file, + vec![&tube_transport_main_child], + )? + }; + + let (mut bootstrap_tube, bootstrap_tube_child) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + // Make sure our end of the Tube knows the PID of the child end. + bootstrap_tube.set_target_pid(process_id); + + tubes.push(TubeTransferData { + tube: bootstrap_tube_child, + tube_token: TubeToken::Bootstrap, + }); + + let (dh_tube, dh_tube_child, alias_pid) = if use_sandbox { + let (broker, child) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + (Some(broker), Some(child), rand::random()) + } else { + (None, None, process_id) + }; + + let tube_transporter = + TubeTransporter::new(tube_transport_pipe, tubes, Some(alias_pid), dh_tube_child); + + // Register this child to be waited upon. + let process_handle = Descriptor(child.as_raw_descriptor()); + wait_ctx + .add(&process_handle, Token::Process(alias_pid)) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to event context", + )?; + + children.insert( + alias_pid, + ChildCleanup { + process_type, + child, + dh_tube, + }, + ); + + if use_sandbox { + wait_ctx + .add( + children[&alias_pid] + .dh_tube + .as_ref() + .unwrap() + .get_read_notifier(), + Token::DuplicateHandle(alias_pid), + ) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to event context", + )?; + } + + Ok(ChildProcess { + bootstrap_tube, + tube_transporter, + process_id, + alias_pid, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use base::thread::spawn_with_timeout; + + /// Verifies that the supervisor loop exits normally with a single child that exits. 
+ #[test] + fn smoke_test() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events).unwrap(); + }) + .try_join(Duration::from_secs(5)) + .unwrap(); + } + + /// Verifies that the supervisor loop exits normally when a device exits first, and then + /// the main loop exits. + #[test] + fn main_and_device_clean_exit() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["3"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + let _child_device = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Block, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events).unwrap(); + }) + .try_join(Duration::from_secs(5)) + .unwrap(); + } + + /// Verifies that the supervisor loop ends even if a device takes too long to exit. + #[test] + fn device_takes_too_long_to_exit() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + let _child_device = spawn_child( + "sleep", + &["10"], + None, + None, + ProcessType::Block, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + assert_eq!( + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events) + .to_exit_code() + .unwrap(), + ExitCode::from(Exit::BrokerMainExitedTimeout), + ); + }) + .try_join(Duration::from_secs(10)) + .unwrap(); + } + + /// Verifies that the supervisor loop ends even if the main process takes too long to exit. 
+ #[test] + fn main_takes_too_long_to_exit() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["10"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + let _child_device = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Block, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + assert_eq!( + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events) + .to_exit_code() + .unwrap(), + ExitCode::from(Exit::BrokerDeviceExitedTimeout), + ); + }) + .try_join(Duration::from_secs(10)) + .unwrap(); + } + + /// Verifies that the supervisor loop ends even if a device takes too long to exit. + #[test] + fn device_crash_returns_child_error() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + let _child_device = spawn_child( + "cmd", + &["/c", "exit -1"], + None, + None, + ProcessType::Block, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + assert_eq!( + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events) + .to_exit_code() + .unwrap(), + ExitCode::from(to_process_type_error(-1i32 as u32, ProcessType::Block) as i32), + ); + }) + .try_join(Duration::from_secs(10)) + .unwrap(); + } + + /// Verifies that sigterm makes the supervisor loop signal the exit event. + #[test] + fn sigterm_signals_exit_event() { + let exit_event = Event::new().unwrap(); + let exit_event_copy = exit_event.try_clone().unwrap(); + + spawn_with_timeout(move || { + let sigterm_event = Event::new().unwrap(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let mut children: HashMap = HashMap::new(); + let _child_main = spawn_child( + "sleep", + &["2"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + wait_ctx.add(&sigterm_event, Token::Sigterm).unwrap(); + sigterm_event.write(1).unwrap(); + + assert_eq!( + Supervisor::broker_supervise_loop(children, wait_ctx, vec![exit_event_copy]) + .to_exit_code() + .unwrap(), + ExitCode::from(Exit::KilledBySignal), + ); + }) + .try_join(Duration::from_secs(10)) + .unwrap(); + + exit_event.read_timeout(Duration::from_secs(0)).unwrap(); + } +} diff --git a/src/crosvm/sys/windows/cmdline.rs b/src/crosvm/sys/windows/cmdline.rs new file mode 100644 index 0000000000..699336a497 --- /dev/null +++ b/src/crosvm/sys/windows/cmdline.rs @@ -0,0 +1,85 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
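+//
+// Windows-specific subcommand definitions. Each `run-*` command below uses
+// `argh_helpers::generate_catchall_args` to capture its raw argument list so
+// the target process can re-parse it.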
+
+use argh::FromArgs;
+
+use argh_helpers::generate_catchall_args;
+#[derive(Debug, FromArgs)]
+#[argh(subcommand)]
+/// Windows Devices
+pub enum DevicesSubcommand {}
+
+#[cfg(feature = "slirp")]
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-slirp")]
+/// Start a new slirp instance
+pub struct RunSlirpCommand {}
+
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-main")]
+/// Start a new broker instance
+pub struct RunMainCommand {}
+
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-metrics")]
+/// Start a new metrics instance
+pub struct RunMetricsCommand {}
+
+/// Start a new mp crosvm instance
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-mp")]
+pub struct RunMPCommand {}
+
+#[derive(FromArgs)]
+#[argh(subcommand)]
+/// Windows subcommands
+pub enum Commands {
+    RunMetrics(RunMetricsCommand),
+    RunMP(RunMPCommand),
+    #[cfg(feature = "slirp")]
+    RunSlirp(RunSlirpCommand),
+    RunMain(RunMainCommand),
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use crate::crosvm::cmdline::RunCommand;
+
+    fn get_args() -> Vec<&'static str> {
+        vec!["--bios", "C:\\src\\crosvm\\out\\image\\default\\images\\bios.rom",
+            "--crash-pipe-name", "\\\\.\\pipe\\crashpad_27812_XGTCCTBYULHHLEJU", "--cpus", "4",
+            "--mem", "8192",
+            "--log-file", "C:\\tmp\\Emulator.log",
+            "--kernel-log-file", "C:\\tmp\\Hypervisor.log",
+            "--logs-directory", "C:\\tmp\\emulator_logs",
+            "--serial", "hardware=serial,num=1,type=file,path=C:\\tmp\\AndroidSerial.log,earlycon=true",
+            "--serial", "hardware=virtio-console,num=1,type=file,path=C:\\tmp\\AndroidSerial.log,console=true",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\aggregate.img",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\metadata.img",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\userdata.img",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\misc.img",
+            "--process-invariants-handle", "7368", "--process-invariants-size", "568",
+            "--gpu", "angle=true,backend=gfxstream,egl=true,gles=false,glx=false,refresh_rate=60,surfaceless=false,vulkan=true,wsi=vk,display_mode=borderless_full_screen,hidden",
+            "--host-guid", "09205719-879f-4324-8efc-3e362a4096f4",
+            "--ac97", "backend=win_audio",
+            "--cid", "3", "--multi-touch", "nil", "--mouse", "nil", "--product-version", "99.9.9.9",
+            "--product-channel", "Local", "--product-name", "Play Games",
+            "--service-pipe-name", "service-ipc-8244a83a-ae3f-486f-9c50-3fc47b309d27",
+            "--pstore", "path=C:\\tmp\\pstore,size=1048576",
+            "--pvclock",
+            "--params", "fake args"]
+    }
+
+    #[test]
+    fn parse_run_mp_test() {
+        let _ = RunMPCommand::from_args(&[&"run-mp"], &get_args()).unwrap();
+    }
+
+    #[test]
+    fn parse_run_test() {
+        let _ = RunCommand::from_args(&[&"run-main"], &get_args()).unwrap();
+    }
+}
diff --git a/src/crosvm/sys/windows/config.rs b/src/crosvm/sys/windows/config.rs
new file mode 100644
index 0000000000..91dddb27e0
--- /dev/null
+++ b/src/crosvm/sys/windows/config.rs
@@ -0,0 +1,822 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
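+//
+// Windows-specific configuration: ac97/gpu option parsing plus the
+// `ProcessType`, `IrqChipKind`, and `HypervisorKind` definitions shared with
+// the broker and exit-code handling.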
+ +use std::str::FromStr; + +#[cfg(feature = "gpu")] +use base::info; +#[cfg(all(feature = "prod-build", feature = "kiwi"))] +use devices::serial_device::SerialType; +#[cfg(feature = "gpu")] +use devices::virtio::{GpuDisplayMode, GpuDisplayParameters, GpuMode, GpuParameters}; +use devices::Ac97Parameters; +use devices::SerialParameters; +use metrics::event_details_proto::EmulatorProcessType; +#[cfg(feature = "gpu")] +use rutabaga_gfx::{calculate_context_mask, RutabagaWsi}; +use serde::{Deserialize, Serialize}; + +use crate::crosvm::{argument, config::Config}; + +#[cfg(feature = "audio")] +pub fn parse_ac97_options( + _ac97_params: &mut Ac97Parameters, + key: &str, + value: &str, +) -> Result<(), String> { + Err(format!("unknown ac97 parameter {} {}", key, value)) +} + +#[cfg(feature = "audio")] +pub(crate) fn check_ac97_backend( + #[allow(unused_variables)] ac97_params: &Ac97Parameters, +) -> Result<(), String> { + Ok(()) +} + +#[cfg(feature = "gpu")] +pub fn is_gpu_backend_deprecated(backend: &str) -> bool { + match backend { + "2d" | "2D" | "3d" | "3D" | "virglrenderer" => { + cfg!(feature = "gfxstream") + } + _ => false, + } +} + +#[cfg(feature = "gfxstream")] +pub fn use_vulkan() -> bool { + false +} + +pub fn check_serial_params( + #[allow(unused_variables)] serial_params: &SerialParameters, +) -> Result<(), String> { + #[cfg(all(feature = "prod-build", feature = "kiwi"))] + { + if matches!(serial_params.type_, SerialType::SystemSerialType) { + return Err(format!( + "device type not supported: {}", + serial_params.type_.to_string() + )); + } + if serial_params.stdin { + return Err(format!("parameter not supported: stdin")); + } + } + Ok(()) +} + +pub fn validate_config(_cfg: &mut Config) -> std::result::Result<(), String> { + Ok(()) +} + +#[cfg(feature = "gpu")] +pub fn parse_gpu_options(s: &str) -> Result { + parse_gpu_options_inner(s).map_err(|e| e.to_string()) +} + +#[cfg(feature = "gpu")] +fn parse_gpu_options_inner(s: &str) -> argument::Result { + let mut gpu_params: GpuParameters = Default::default(); + #[cfg(feature = "gfxstream")] + let mut vulkan_specified = false; + #[cfg(feature = "gfxstream")] + let mut syncfd_specified = false; + #[cfg(feature = "gfxstream")] + let mut gles31_specified = false; + #[cfg(feature = "gfxstream")] + let mut angle_specified = false; + + let mut width: Option = None; + let mut height: Option = None; + let mut dpi: Option = None; + let mut display_mode: Option = None; + #[cfg(feature = "gfxstream")] + let mut vsync: Option = None; + let opts = s + .split(',') + .map(|frag| frag.split('=')) + .map(|mut kv| (kv.next().unwrap_or(""), kv.next().unwrap_or(""))); + let mut hidden: Option = None; + + for (k, v) in opts { + match k { + "backend" => match v { + "2d" | "2D" => { + if crate::crosvm::sys::config::is_gpu_backend_deprecated(v) { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "this backend type is deprecated, please use gfxstream.", + ), + }); + } else { + gpu_params.mode = GpuMode::Mode2D; + } + } + "3d" | "3D" | "virglrenderer" => { + if crate::crosvm::sys::config::is_gpu_backend_deprecated(v) { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "this backend type is deprecated, please use gfxstream.", + ), + }); + } else { + gpu_params.mode = GpuMode::ModeVirglRenderer; + } + } + #[cfg(feature = "gfxstream")] + "gfxstream" => { + gpu_params.mode = GpuMode::ModeGfxstream; + } + _ => { + return Err(argument::Error::InvalidValue { + value: 
v.to_string(), + expected: String::from( + #[cfg(feature = "gfxstream")] + "gpu parameter 'backend' should be one of (2d|virglrenderer|gfxstream)", + #[cfg(not(feature = "gfxstream"))] + "gpu parameter 'backend' should be one of (2d|3d)", + ), + }); + } + }, + "egl" => match v { + "true" | "" => { + gpu_params.renderer_use_egl = true; + } + "false" => { + gpu_params.renderer_use_egl = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'egl' should be a boolean"), + }); + } + }, + "gles" => match v { + "true" | "" => { + gpu_params.renderer_use_gles = true; + } + "false" => { + gpu_params.renderer_use_gles = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'gles' should be a boolean"), + }); + } + }, + "glx" => match v { + "true" | "" => { + gpu_params.renderer_use_glx = true; + } + "false" => { + gpu_params.renderer_use_glx = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'glx' should be a boolean"), + }); + } + }, + "surfaceless" => match v { + "true" | "" => { + gpu_params.renderer_use_surfaceless = true; + } + "false" => { + gpu_params.renderer_use_surfaceless = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'surfaceless' should be a boolean"), + }); + } + }, + #[cfg(feature = "gfxstream")] + "syncfd" => { + syncfd_specified = true; + match v { + "true" | "" => { + gpu_params.gfxstream_use_syncfd = true; + } + "false" => { + gpu_params.gfxstream_use_syncfd = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'syncfd' should be a boolean"), + }); + } + } + } + #[cfg(feature = "gfxstream")] + "angle" => { + angle_specified = true; + match v { + "true" | "" => { + gpu_params.gfxstream_use_guest_angle = true; + } + "false" => { + gpu_params.gfxstream_use_guest_angle = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'angle' should be a boolean"), + }); + } + } + } + "vulkan" => { + #[cfg(feature = "gfxstream")] + { + vulkan_specified = true; + } + match v { + "true" | "" => { + gpu_params.use_vulkan = true; + } + "false" => { + gpu_params.use_vulkan = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'vulkan' should be a boolean"), + }); + } + } + } + #[cfg(feature = "gfxstream")] + "gles3.1" => { + gles31_specified = true; + match v { + "true" | "" => { + gpu_params.gfxstream_support_gles31 = true; + } + "false" => { + gpu_params.gfxstream_support_gles31 = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'gles3.1' should be a boolean"), + }); + } + } + } + "wsi" => match v { + "vk" => { + gpu_params.wsi = Some(RutabagaWsi::Vulkan); + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'wsi' should be vk"), + }); + } + }, + "width" => { + if let Some(width) = width { + return Err(argument::Error::TooManyArguments(format!( + "width was already specified: {}", + width + ))); + } + width = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: 
String::from("gpu parameter 'width' must be a valid integer"), + })?, + ); + } + "height" => { + if let Some(height) = height { + return Err(argument::Error::TooManyArguments(format!( + "height was already specified: {}", + height + ))); + } + height = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "gpu parameter 'height' must be a valid integer", + ), + })?, + ); + } + "dpi" => { + if let Some(dpi) = dpi { + return Err(argument::Error::TooManyArguments(format!( + "dpi was already specified: {}", + dpi + ))); + } + dpi = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'dpi' must be a valid integer"), + })?, + ); + } + #[cfg(feature = "gfxstream")] + "refresh_rate" => { + if let Some(vsync) = vsync { + return Err(argument::Error::TooManyArguments(format!( + "refresh_rate was already specified: {}", + vsync + ))); + } + vsync = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "gpu parameter 'refresh_rate' must be a valid integer", + ), + })?, + ); + } + "display_mode" => { + if let Some(display_mode) = display_mode { + return Err(argument::Error::TooManyArguments(format!( + "display_mode was already specified: {}", + display_mode + ))); + } + display_mode = Some(String::from(v)); + } + "hidden" => match v { + "true" | "" => { + hidden = Some(true); + } + "false" => { + hidden = Some(false); + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'hidden' should be a boolean"), + }); + } + }, + "cache-path" => gpu_params.cache_path = Some(v.to_string()), + "cache-size" => gpu_params.cache_size = Some(v.to_string()), + "udmabuf" => match v { + "true" | "" => { + gpu_params.udmabuf = true; + } + "false" => { + gpu_params.udmabuf = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'udmabuf' should be a boolean"), + }); + } + }, + "context-types" => { + let context_types: Vec = v.split(':').map(|s| s.to_string()).collect(); + gpu_params.context_mask = calculate_context_mask(context_types); + } + "" => {} + _ => { + return Err(argument::Error::UnknownArgument(format!( + "gpu parameter {}", + k + ))); + } + } + } + + match display_mode.as_deref() { + Some("windowed") => gpu_params.display_params = GpuDisplayParameters::default_windowed(), + Some("borderless_full_screen") => gpu_params.display_params = GpuDisplayParameters::default_borderless_full_screen(), + None => {} + Some(display_mode) => return Err(argument::Error::InvalidValue { + value: display_mode.to_string(), + expected: String::from("gpu parameter 'display_mode' must be either 'borderless_full_screen' or 'windowed'") + }) + } + + if let Some(hidden) = hidden { + gpu_params.display_params.hidden = hidden; + } + + #[cfg(feature = "gfxstream")] + { + if let Some(vsync) = vsync { + gpu_params.vsync = vsync; + } + } + + match gpu_params.display_params.display_mode { + GpuDisplayMode::Windowed { + width: ref mut width_in_params, + height: ref mut height_in_params, + dpi: ref mut dpi_in_params, + } => { + if let Some(width) = width { + *width_in_params = width; + } + if let Some(height) = height { + *height_in_params = height; + } + if let Some(dpi) = dpi { + *dpi_in_params = dpi; + } + } + GpuDisplayMode::BorderlessFullScreen(_) => { + if width.is_some() || height.is_some() || 
dpi.is_some() {
+                return Err(argument::Error::UnknownArgument(
+                    "width, height, or dpi is only supported for windowed display mode".to_string(),
+                ));
+            }
+        }
+    }
+
+    #[cfg(feature = "gfxstream")]
+    {
+        if !vulkan_specified && gpu_params.mode == GpuMode::ModeGfxstream {
+            gpu_params.use_vulkan = crate::crosvm::sys::config::use_vulkan();
+        }
+        if syncfd_specified || angle_specified || gles31_specified {
+            match gpu_params.mode {
+                GpuMode::ModeGfxstream => {}
+                _ => {
+                    return Err(argument::Error::UnknownArgument(
+                        "gpu parameters syncfd, angle, and gles3.1 are only supported for the gfxstream backend"
+                            .to_string(),
+                    ));
+                }
+            }
+        }
+    }
+
+    Ok(gpu_params)
+}
+
+#[cfg(feature = "gpu")]
+pub(crate) fn validate_gpu_config(cfg: &mut Config) -> Result<(), String> {
+    if let Some(gpu_parameters) = cfg.gpu_parameters.as_ref() {
+        let (width, height) = gpu_parameters.display_params.get_virtual_display_size();
+        for virtio_multi_touch in cfg.virtio_multi_touch.iter_mut() {
+            virtio_multi_touch.set_default_size(width, height);
+        }
+        for virtio_single_touch in cfg.virtio_single_touch.iter_mut() {
+            virtio_single_touch.set_default_size(width, height);
+        }
+
+        let dpi = gpu_parameters.display_params.get_dpi();
+        info!("using dpi {} on the Android guest", dpi);
+        cfg.params.push(format!("androidboot.lcd_density={}", dpi));
+    }
+    Ok(())
+}
+
+/// Each type of process should have its own type here. This affects both exit
+/// handling and sandboxing policy.
+///
+/// WARNING: do NOT change the values of items in this enum. The enum values are used in our exit
+/// codes, and relied upon by metrics analysis. The max value for this enum is 0x1F = 31 as it is
+/// restricted to five bits per `crate::crosvm::sys::windows::exit::to_process_type_error`.
+#[derive(Clone, Copy, PartialEq, Debug, enumn::N)]
+#[repr(u8)]
+pub enum ProcessType {
+    Block = 1,
+    Main = 2,
+    Metrics = 3,
+    Net = 4,
+    Slirp = 5,
+}
+
+impl From<ProcessType> for EmulatorProcessType {
+    fn from(process_type: ProcessType) -> Self {
+        match process_type {
+            ProcessType::Block => EmulatorProcessType::PROCESS_TYPE_BLOCK,
+            ProcessType::Main => EmulatorProcessType::PROCESS_TYPE_MAIN,
+            ProcessType::Metrics => EmulatorProcessType::PROCESS_TYPE_METRICS,
+            ProcessType::Net => EmulatorProcessType::PROCESS_TYPE_NET,
+            ProcessType::Slirp => EmulatorProcessType::PROCESS_TYPE_SLIRP,
+        }
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
+pub enum IrqChipKind {
+    /// All interrupt controllers are emulated in the kernel.
+    Kernel,
+    /// APIC is emulated in the kernel. All other interrupt controllers are in userspace.
+    Split,
+    /// All interrupt controllers are emulated in userspace.
+    Userspace,
+}
+
+impl FromStr for IrqChipKind {
+    type Err = &'static str;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "kernel" => Ok(Self::Kernel),
+            "split" => Ok(Self::Split),
+            "userspace" => Ok(Self::Userspace),
+            _ => Err("invalid irqchip kind: expected \"kernel\", \"split\", or \"userspace\""),
+        }
+    }
+}
+
+/// Hypervisor backend.
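+///
+/// A parsing sketch (matching in `from_str` below is case-insensitive; each
+/// variant requires its corresponding feature):
+///
+/// ```ignore
+/// assert_eq!("HAXM".parse::<HypervisorKind>(), Ok(HypervisorKind::Haxm));
+/// ```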
+#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
+pub enum HypervisorKind {
+    #[cfg(feature = "gvm")]
+    Gvm,
+    #[cfg(feature = "haxm")]
+    Haxm,
+    #[cfg(feature = "haxm")]
+    Ghaxm,
+    #[cfg(feature = "whpx")]
+    Whpx,
+}
+
+impl FromStr for HypervisorKind {
+    type Err = &'static str;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            #[cfg(feature = "gvm")]
+            "gvm" => Ok(HypervisorKind::Gvm),
+            #[cfg(feature = "haxm")]
+            "haxm" => Ok(HypervisorKind::Haxm),
+            #[cfg(feature = "haxm")]
+            "ghaxm" => Ok(HypervisorKind::Ghaxm),
+            #[cfg(feature = "whpx")]
+            "whpx" => Ok(HypervisorKind::Whpx),
+            _ => Err("invalid hypervisor backend"),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[cfg(feature = "gpu")]
+    use crate::crosvm::sys::config::parse_gpu_options;
+    #[cfg(feature = "gpu")]
+    use devices::virtio::gpu::GpuDisplayMode;
+
+    #[cfg(all(feature = "gpu", feature = "gfxstream"))]
+    #[test]
+    fn parse_gpu_options_gfxstream_with_syncfd_specified() {
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("backend=gfxstream,syncfd=true").unwrap();
+
+            assert!(gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("syncfd=true,backend=gfxstream").unwrap();
+            assert!(gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("backend=gfxstream,syncfd=false").unwrap();
+
+            assert!(!gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("syncfd=false,backend=gfxstream").unwrap();
+            assert!(!gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            assert!(parse_gpu_options("backend=gfxstream,syncfd=invalid_value").is_err());
+        }
+        {
+            assert!(parse_gpu_options("syncfd=invalid_value,backend=gfxstream").is_err());
+        }
+    }
+
+    #[cfg(all(feature = "gpu", feature = "gfxstream"))]
+    #[test]
+    fn parse_gpu_options_not_gfxstream_with_syncfd_specified() {
+        {
+            assert!(parse_gpu_options("backend=virglrenderer,syncfd=true").is_err());
+        }
+        {
+            assert!(parse_gpu_options("syncfd=true,backend=virglrenderer").is_err());
+        }
+    }
+
+    #[cfg(all(feature = "gpu", feature = "gfxstream"))]
+    #[test]
+    fn parse_gpu_options_gfxstream_with_wsi_specified() {
+        {
+            let gpu_params: GpuParameters = parse_gpu_options("backend=gfxstream,wsi=vk").unwrap();
+            assert!(matches!(gpu_params.wsi, Some(RutabagaWsi::Vulkan)));
+        }
+        {
+            let gpu_params: GpuParameters = parse_gpu_options("wsi=vk,backend=gfxstream").unwrap();
+            assert!(matches!(gpu_params.wsi, Some(RutabagaWsi::Vulkan)));
+        }
+        {
+            assert!(parse_gpu_options("backend=gfxstream,wsi=invalid_value").is_err());
+        }
+        {
+            assert!(parse_gpu_options("wsi=invalid_value,backend=gfxstream").is_err());
+        }
+    }
+
+    #[cfg(feature = "audio")]
+    #[test]
+    fn parse_ac97_valid() {
+        crate::crosvm::config::parse_ac97_options("backend=win_audio")
+            .expect("parse should have succeeded");
+    }
+
+    #[cfg(feature = "gpu")]
+    #[test]
+    fn parse_gpu_options_default_vulkan_support() {
+        #[cfg(unix)]
+        assert!(
+            !parse_gpu_options("backend=virglrenderer")
+                .unwrap()
+                .use_vulkan
+        );
+        #[cfg(feature = "gfxstream")]
+        assert!(!parse_gpu_options("backend=gfxstream").unwrap().use_vulkan);
+        #[cfg(all(feature = "gfxstream", unix))]
+        assert!(parse_gpu_options("backend=gfxstream").unwrap().use_vulkan);
+    }
+
+    #[cfg(feature = "gpu")]
+    #[test]
+    fn parse_gpu_options_with_vulkan_specified() {
+        assert!(parse_gpu_options("vulkan=true").unwrap().use_vulkan);
+        #[cfg(unix)]
+        assert!(
+            parse_gpu_options("backend=virglrenderer,vulkan=true")
.unwrap() + .use_vulkan + ); + #[cfg(unix)] + assert!( + parse_gpu_options("vulkan=true,backend=virglrenderer") + .unwrap() + .use_vulkan + ); + assert!(!parse_gpu_options("vulkan=false").unwrap().use_vulkan); + #[cfg(unix)] + assert!( + !parse_gpu_options("backend=virglrenderer,vulkan=false") + .unwrap() + .use_vulkan + ); + #[cfg(unix)] + assert!( + !parse_gpu_options("vulkan=false,backend=virglrenderer") + .unwrap() + .use_vulkan + ); + #[cfg(unix)] + assert!(parse_gpu_options("backend=virglrenderer,vulkan=invalid_value").is_err()); + assert!(parse_gpu_options("vulkan=invalid_value,backend=virglrenderer").is_err()); + } + + #[cfg(all(feature = "gpu", feature = "gfxstream"))] + #[test] + fn parse_gpu_options_gfxstream_with_gles31_specified() { + assert!( + parse_gpu_options("backend=gfxstream,gles3.1=true") + .unwrap() + .gfxstream_support_gles31 + ); + assert!( + parse_gpu_options("gles3.1=true,backend=gfxstream") + .unwrap() + .gfxstream_support_gles31 + ); + assert!( + !parse_gpu_options("backend=gfxstream,gles3.1=false") + .unwrap() + .gfxstream_support_gles31 + ); + assert!( + !parse_gpu_options("gles3.1=false,backend=gfxstream") + .unwrap() + .gfxstream_support_gles31 + ); + assert!(parse_gpu_options("backend=gfxstream,gles3.1=invalid_value").is_err()); + assert!(parse_gpu_options("gles3.1=invalid_value,backend=gfxstream").is_err()); + } + + #[cfg(all(feature = "gpu", feature = "gfxstream"))] + #[test] + fn parse_gpu_options_not_gfxstream_with_gles31_specified() { + assert!(parse_gpu_options("backend=virglrenderer,gles3.1=true").is_err()); + assert!(parse_gpu_options("gles3.1=true,backend=virglrenderer").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_gpu_display_mode() { + let display_params = parse_gpu_options("display_mode=windowed") + .unwrap() + .display_params; + assert!(matches!( + display_params.display_mode, + GpuDisplayMode::Windowed { .. } + )); + + let display_params = parse_gpu_options("display_mode=borderless_full_screen") + .unwrap() + .display_params; + assert!(matches!( + display_params.display_mode, + GpuDisplayMode::BorderlessFullScreen(_) + )); + + assert!(parse_gpu_options("display_mode=invalid_mode").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_gpu_display_mode_duplicated() { + assert!(parse_gpu_options("display_mode=windowed,display_mode=windowed").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_borderless_full_screen_shouldnt_be_specified_with_size() { + assert!(parse_gpu_options("display_mode=borderless_full_screen,width=1280").is_err()); + assert!(parse_gpu_options("display_mode=borderless_full_screen,height=720").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_windowed_with_size() { + const WIDTH: u32 = 1720; + const HEIGHT: u32 = 1800; + const DPI: u32 = 1808; + + let display_params = + parse_gpu_options(format!("display_mode=windowed,width={}", WIDTH).as_str()) + .unwrap() + .display_params; + assert!( + matches!(display_params.display_mode, GpuDisplayMode::Windowed { width, .. } if width == WIDTH) + ); + + let display_params = + parse_gpu_options(format!("display_mode=windowed,height={}", HEIGHT).as_str()) + .unwrap() + .display_params; + assert!( + matches!(display_params.display_mode, GpuDisplayMode::Windowed { height, .. 
} if height == HEIGHT) + ); + + let display_params = + parse_gpu_options(format!("display_mode=windowed,dpi={}", DPI).as_str()) + .unwrap() + .display_params; + assert!( + matches!(display_params.display_mode, GpuDisplayMode::Windowed { dpi, .. } if dpi == DPI) + ); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_hidden() { + let display_params = parse_gpu_options(format!("hidden=true").as_str()) + .unwrap() + .display_params; + assert!(display_params.hidden); + + let display_params = parse_gpu_options(format!("hidden=false").as_str()) + .unwrap() + .display_params; + assert!(matches!(display_params.hidden, false)); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_size_duplicated() { + assert!(parse_gpu_options("width=1280,width=1280").is_err()); + assert!(parse_gpu_options("height=1280,height=1280").is_err()); + assert!(parse_gpu_options("dpi=1280,dpi=1280").is_err()); + } +} diff --git a/src/crosvm/sys/windows/exit.rs b/src/crosvm/sys/windows/exit.rs new file mode 100644 index 0000000000..c5e82ed35f --- /dev/null +++ b/src/crosvm/sys/windows/exit.rs @@ -0,0 +1,489 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! Enum and Anyhow helpers to set the process exit code. + +use std::fmt::{self, Display, Formatter}; + +use crate::crosvm::sys::config::ProcessType; +use anyhow::Context; + +pub type ExitCode = i32; + +#[derive(Debug)] +pub struct ExitCodeWrapper(pub ExitCode); + +impl Display for ExitCodeWrapper { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "exit code: {} = 0x{:08x}", self.0, self.0) + } +} + +/// Trait for attaching context with process exit codes to a std::result::Result. +pub trait ExitContext { + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into; + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static; + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C; +} + +impl ExitContext for std::result::Result +where + E: std::error::Error + Send + Sync + 'static, +{ + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into, + { + self.context(ExitCodeWrapper(exit_code.into())) + } + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + { + self.context(ExitCodeWrapper(exit_code.into())) + .context(context) + } + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C, + { + self.context(ExitCodeWrapper(exit_code.into())) + .with_context(f) + } +} + +/// Trait for attaching context with process exit codes to an anyhow::Result. 
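+///
+/// For example (`might_fail` stands in for any fallible call that already
+/// returns an `anyhow::Result`):
+///
+/// ```ignore
+/// let res: anyhow::Result<()> =
+///     might_fail().exit_context(Exit::CreateTube, "failed to create tube");
+/// let code: Option<ExitCode> = res.to_exit_code();
+/// ```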
+pub trait ExitContextAnyhow { + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into; + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static; + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C; + + fn to_exit_code(&self) -> Option; +} + +impl ExitContextAnyhow for anyhow::Result { + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into, + { + self.context(ExitCodeWrapper(exit_code.into())) + } + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + { + self.context(ExitCodeWrapper(exit_code.into())) + .context(context) + } + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C, + { + self.context(ExitCodeWrapper(exit_code.into())) + .with_context(f) + } + + fn to_exit_code(&self) -> Option { + self.as_ref() + .err() + .map(|e| e.downcast_ref::()) + .flatten() + .map(|w| w.0) + } +} + +#[macro_export] +macro_rules! bail_exit_code { + ($exit_code:literal, $msg:literal $(,)?) => { + return Err(anyhow!($msg)).exit_code($exit_code) + }; + ($exit_code:literal, $err:expr $(,)?) => { + return Err(anyhow!($err)).exit_code($exit_code) + }; + ($exit_code:literal, $fmt:expr, $($arg:tt)*) => { + return Err(anyhow!($fmt, $($arg)*)).exit_code($exit_code) + }; + ($exit_code:expr, $msg:literal $(,)?) => { + return Err(anyhow!($msg)).exit_code($exit_code) + }; + ($exit_code:expr, $err:expr $(,)?) => { + return Err(anyhow!($err)).exit_code($exit_code) + }; + ($exit_code:expr, $fmt:expr, $($arg:tt)*) => { + return Err(anyhow!($fmt, $($arg)*)).exit_code($exit_code) + }; +} + +#[macro_export] +macro_rules! ensure_exit_code { + ($cond:expr, $exit_code:literal $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, concat!("Condition failed: `", stringify!($cond), "`")); + } + }; + ($cond:expr, $exit_code:literal, $msg:literal $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $msg); + } + }; + ($cond:expr, $exit_code:literal, $err:expr $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $err); + } + }; + ($cond:expr, $exit_code:literal, $fmt:expr, $($arg:tt)*) => { + if !$cond { + bail_exit_code!($exit_code, $fmt, $($arg)*); + } + }; + ($cond:expr, $exit_code:expr $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, concat!("Condition failed: `", stringify!($cond), "`")); + } + }; + ($cond:expr, $exit_code:expr, $msg:literal $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $msg); + } + }; + ($cond:expr, $exit_code:expr, $err:expr $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $err); + } + }; + ($cond:expr, $exit_code:expr, $fmt:expr, $($arg:tt)*) => { + if !$cond { + bail_exit_code!($exit_code, $fmt, $($arg)*); + } + }; +} + +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Exit { + // Windows process exit codes triggered by the kernel tend to be NTSTATUS, so we treat + // our error codes as NTSTATUS to avoid clashing. This means we set the vendor bit. We also + // set the severity to error. As these all set in the MSB, we can write this as a prefix of + // 0xE0. + // + // Because of how these error codes are used in CommandType, we can only use the lower two + // bytes of the u32 for our error codes; in other words, the legal range is + // [0xE0000000, 0xE000FFFF]. 
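+    //
+    // For example, 0xE0000025 (InvalidRunArgs below) decodes as severity = error
+    // (0b11), customer/vendor bit = 1, reserved bit = 0, facility = 0x000,
+    // code = 0x0025.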
+ AddGpuDeviceMemory = 0xE0000001, + AddIrqChipVcpu = 0xE0000002, + AddPmemDeviceMemory = 0xE0000003, + AllocateGpuDeviceAddress = 0xE0000004, + AllocatePmemDeviceAddress = 0xE0000005, + BlockDeviceNew = 0xE0000006, + BuildVm = 0xE0000007, + ChownTpmStorage = 0xE0000008, + CloneEvent = 0xE000000A, + CloneVcpu = 0xE000000B, + ConfigureVcpu = 0xE000000C, + CreateAc97 = 0xE000000D, + CreateConsole = 0xE000000E, + CreateDisk = 0xE000000F, + CreateEvent = 0xE0000010, + CreateGralloc = 0xE0000011, + CreateGvm = 0xE0000012, + CreateSocket = 0xE0000013, + CreateTapDevice = 0xE0000014, + CreateTimer = 0xE0000015, + CreateTpmStorage = 0xE0000016, + CreateVcpu = 0xE0000017, + CreateWaitContext = 0xE0000018, + Disk = 0xE0000019, + DiskImageLock = 0xE000001A, + DropCapabilities = 0xE000001B, + EventDeviceSetup = 0xE000001C, + EnableHighResTimer = 0xE000001D, + HandleCreateQcowError = 0xE000001E, + HandleVmRequestError = 0xE0000020, + InitSysLogError = 0xE0000021, + InputDeviceNew = 0xE0000022, + InputEventsOpen = 0xE0000023, + InvalidRunArgs = 0xE0000025, + InvalidSubCommand = 0xE0000026, + InvalidSubCommandArgs = 0xE0000027, + InvalidWaylandPath = 0xE0000028, + LoadKernel = 0xE0000029, + MissingCommandArg = 0xE0000030, + ModifyBatteryError = 0xE0000031, + NetDeviceNew = 0xE0000032, + OpenAcpiTable = 0xE0000033, + OpenAndroidFstab = 0xE0000034, + OpenBios = 0xE0000035, + OpenInitrd = 0xE0000036, + OpenKernel = 0xE0000037, + OpenVinput = 0xE0000038, + PivotRootDoesntExist = 0xE0000039, + PmemDeviceImageTooBig = 0xE000003A, + PmemDeviceNew = 0xE000003B, + ReadMemAvailable = 0xE000003C, + RegisterBalloon = 0xE000003D, + RegisterBlock = 0xE000003E, + RegisterGpu = 0xE000003F, + RegisterNet = 0xE0000040, + RegisterP9 = 0xE0000041, + RegisterRng = 0xE0000042, + RegisterWayland = 0xE0000043, + ReserveGpuMemory = 0xE0000044, + ReserveMemory = 0xE0000045, + ReservePmemMemory = 0xE0000046, + ResetTimer = 0xE0000047, + RngDeviceNew = 0xE0000048, + RunnableVcpu = 0xE0000049, + SettingSignalMask = 0xE000004B, + SpawnVcpu = 0xE000004D, + SysUtil = 0xE000004E, + Timer = 0xE000004F, + ValidateRawDescriptor = 0xE0000050, + VirtioPciDev = 0xE0000051, + WaitContextAdd = 0xE0000052, + WaitContextDelete = 0xE0000053, + WhpxSetupError = 0xE0000054, + VcpuFailEntry = 0xE0000055, + VcpuRunError = 0xE0000056, + VcpuShutdown = 0xE0000057, + VcpuSystemEvent = 0xE0000058, + WaitUntilRunnable = 0xE0000059, + CreateControlServer = 0xE000005A, + CreateTube = 0xE000005B, + UsbError = 0xE000005E, + GuestMemoryLayout = 0xE000005F, + CreateVm = 0xE0000060, + CreateGuestMemory = 0xE0000061, + CreateIrqChip = 0xE0000062, + SpawnIrqThread = 0xE0000063, + ConnectTube = 0xE0000064, + BalloonDeviceNew = 0xE0000065, + BalloonStats = 0xE0000066, + BorrowVfioContainer = 0xE0000067, + OpenCompositeFooterFile = 0xE0000068, + OpenCompositeHeaderFile = 0xE0000069, + OpenCompositeImageFile = 0xE0000070, + CreateCompositeDisk = 0xE0000071, + MissingControlTube = 0xE0000072, + TubeTransporterInit = 0xE0000073, + TubeFailure = 0xE0000074, + ProcessSpawnFailed = 0xE0000075, + LogFile = 0xE0000076, + CreateZeroFiller = 0xE0000077, + GenerateAcpi = 0xE0000078, + WaitContextWait = 0xE0000079, + SetSigintHandler = 0xE000007A, + KilledBySignal = 0xE000007B, + BrokerDeviceExitedTimeout = 0xE000007C, + BrokerMainExitedTimeout = 0xE000007D, + MemoryTooLarge = 0xE000007E, + BrokerMetricsExitedTimeout = 0xE000007F, + MetricsController = 0xE0000080, + SwiotlbTooLarge = 0xE0000081, + UserspaceVsockDeviceNew = 0xE0000082, + VhostUserBlockDeviceNew = 0xE0000083, + 
CrashReportingInit = 0xE0000084, + StartBackendDevice = 0xE0000085, + ConfigureHotPlugDevice = 0xE0000086, + InvalidHotPlugKey = 0xE0000087, + InvalidVfioPath = 0xE0000088, + NoHotPlugBus = 0xE0000089, + SandboxError = 0xE000008A, + Pstore = 0xE000008B, + ProcessInvariantsInit = 0xE000008C, + VirtioVhostUserDeviceNew = 0xE000008D, + CloneTube = 0xE000008E, + VhostUserGpuDeviceNew = 0xE000008F, + CreateAsyncDisk = 0xE0000090, + CreateDiskCheckAsyncOkError = 0xE0000091, + VhostUserNetDeviceNew = 0xE0000092, + BrokerSigtermTimeout = 0xE0000093, + SpawnVcpuMonitor = 0xE0000094, + NoDefaultHypervisor = 0xE0000095, + TscCalibrationFailed = 0xE0000096, + UnknownError = 0xE0000097, + CommonChildSetupError = 0xE0000098, +} + +impl From for ExitCode { + fn from(exit: Exit) -> Self { + exit as ExitCode + } +} + +// Bitfield masks for NTSTATUS & our extension of the format. See to_process_type_error for details. +mod bitmasks { + pub const FACILITY_FIELD_LOWER_MASK: u32 = u32::from_be_bytes([0x00, 0x3F, 0x00, 0x00]); + pub const EXTRA_DATA_FIELD_MASK: u32 = u32::from_be_bytes([0x0F, 0xC0, 0x00, 0x00]); + #[cfg(test)] + pub const EXTRA_DATA_FIELD_COMMAND_TYPE_MASK: u32 = + u32::from_be_bytes([0x07, 0xC0, 0x00, 0x00]); + pub const EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK: u32 = + u32::from_be_bytes([0x08, 0x00, 0x00, 0x00]); + pub const VENDOR_FIELD_MASK: u32 = u32::from_be_bytes([0x20, 0x00, 0x00, 0x00]); + pub const RESERVED_BIT_MASK: u32 = u32::from_be_bytes([0x10, 0x00, 0x00, 0x00]); + pub const COMMAND_TYPE_MASK: u32 = u32::from_be_bytes([0x00, 0x00, 0x00, 0x1F]); +} +use bitmasks::*; + +/// If you are looking for a fun interview question, you have come to the right place. To +/// understand the details of NTSTATUS, which you'll want to do before reading further, visit +/// https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-erref/87fba13e-bf06-450e-83b1-9241dc81e781. +/// +/// This function is unfortunately what happens when you only have six bits to store auxiliary +/// information, and have to fit in with an existing bitfield's schema. +/// +/// This function packs bits in NTSTATUS results (generally what a Windows exit code should be). +/// There are three primary cases it deals with: +/// * Vendor specific exits. These are error codes we generate explicitly in crosvm. We will +/// pack these codes with the lower 6 "facility" bits set so they can't collide with the other +/// cases. The MSB of the facility field will be clear. +/// +/// * Non vendor NTSTATUS exits. These are error codes which come from Windows. We flip the +/// vendor bit on these because we're going to pack the facility field, and leaving it unset +/// would cause us to violate the rule that if the vendor bit is unset, we shouldn't exceed +/// FACILITY_MAXIMUM_VALUE in that field. The MSB of the facility field will be clear. +/// +/// * Non NTSTATUS errors. We detect these with two heuristics: +/// a) Reserved field is set. +/// b) The facility field has exceeded the bottom six bits. +/// +/// For such cases, we pack as much of the error as we can into the lower 6 bits of the +/// facility field, and code field (2 bytes). In this case, the most significant bit of the +/// facility field is set. +/// +/// For all of the cases above, we pack the most significant 5 bits of the facility field with +/// information about what command type generated this error. 
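+///
+/// A worked example of the "valid vendor code" case, using values defined in
+/// this crate (`Exit::InvalidRunArgs` = 0xE0000025, `ProcessType::Main` = 2):
+///
+/// ```ignore
+/// assert_eq!(
+///     to_process_type_error(Exit::InvalidRunArgs as u32, ProcessType::Main),
+///     // 0xE0000025 | 0x003F0000 (lower facility bits) | 2 << 22 (command type)
+///     0xE0BF0025,
+/// );
+/// ```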
+pub fn to_process_type_error(error_code: u32, cmd_type: ProcessType) -> u32 {
+    let is_vendor = error_code & VENDOR_FIELD_MASK != 0;
+
+    // The reserved bit is always clear on an NTSTATUS code.
+    let is_reserved_bit_clear = error_code & RESERVED_BIT_MASK == 0;
+
+    // The six most significant bits of the facility field are where we'll be storing our
+    // command type (and whether we have a valid NTSTATUS error). If bits are already set there,
+    // it means this isn't a valid NTSTATUS code.
+    let is_extra_data_field_clear = error_code & EXTRA_DATA_FIELD_MASK == 0;
+
+    let is_ntstatus = is_reserved_bit_clear && is_extra_data_field_clear;
+
+    // We use the top bit of the facility field to store whether we ran out of space to pack
+    // the error. The next five bits are where we store the command type, so we'll shift them
+    // into the appropriate position here.
+    let command_type = (cmd_type as u32 & COMMAND_TYPE_MASK) << 22;
+
+    match (is_ntstatus, is_vendor) {
+        // Valid vendor code
+        (true, true) => {
+            // Set all the lower facility bits, and attach the command type.
+            error_code | FACILITY_FIELD_LOWER_MASK | command_type
+        }
+
+        // Valid non-vendor code
+        (true, false) => {
+            // Set the vendor bit and attach the command type.
+            error_code | VENDOR_FIELD_MASK | command_type
+        }
+
+        // Not a valid NTSTATUS code.
+        _ => {
+            // Clear the extra data field, and set the top bit of the facility field to
+            // signal that we didn't have enough space for the full error code.
+            error_code & !EXTRA_DATA_FIELD_MASK | command_type | EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use winapi::shared::ntstatus::STATUS_BAD_INITIAL_PC;
+
+    #[test]
+    fn test_to_process_type_error_ntstatus_vendor() {
+        let e = to_process_type_error(Exit::InvalidRunArgs as u32, ProcessType::Main);
+        assert_eq!(
+            e & EXTRA_DATA_FIELD_COMMAND_TYPE_MASK,
+            (ProcessType::Main as u32) << 22
+        );
+        assert_eq!(e & EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK, 0);
+
+        // This is a valid NTSTATUS error.
+        assert_eq!(e & RESERVED_BIT_MASK, 0);
+
+        // Check the actual crosvm error code contained in the NTSTATUS. We don't mutate the
+        // severity field, so we don't mask it off. We mask off the facility field entirely because
+        // that's where we stored the command type & NTSTATUS validity bit.
+        assert_eq!(e & 0xF000FFFF_u32, Exit::InvalidRunArgs as u32);
+    }
+
+    #[test]
+    fn test_to_process_type_error_ntstatus_non_vendor() {
+        let e = to_process_type_error(STATUS_BAD_INITIAL_PC as u32, ProcessType::Main);
+        assert_eq!(
+            e & EXTRA_DATA_FIELD_COMMAND_TYPE_MASK,
+            (ProcessType::Main as u32) << 22
+        );
+        assert_eq!(e & EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK, 0);
+
+        // This is a valid NTSTATUS error.
+        assert_eq!(e & RESERVED_BIT_MASK, 0);
+
+        // Check the actual error code contained in the NTSTATUS. We mask off all our extra data
+        // fields and switch off the vendor bit to confirm the actual code was left alone.
+        assert_eq!(
+            e & !EXTRA_DATA_FIELD_MASK & !VENDOR_FIELD_MASK,
+            STATUS_BAD_INITIAL_PC as u32
+        );
+    }
+
+    #[test]
+    fn test_to_process_type_error_wontfit_ntstatus() {
+        let e = to_process_type_error(0xFFFFFFFF, ProcessType::Main);
+        assert_eq!(
+            e & EXTRA_DATA_FIELD_COMMAND_TYPE_MASK,
+            (ProcessType::Main as u32) << 22
+        );
+
+        // -1 is not a valid NTSTATUS error.
+        assert_ne!(e & RESERVED_BIT_MASK, 0);
+
+        // Overflow did occur.
+ assert_ne!(e & EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK, 0); + + // Check that we left the rest of the bits (except for our command type field & overflow + // bit) in the exit code untouched. + assert_eq!(e & 0xF03FFFFF_u32, 0xF03FFFFF_u32); + } +} diff --git a/src/crosvm/sys/windows/stats.rs b/src/crosvm/sys/windows/stats.rs new file mode 100644 index 0000000000..259852d172 --- /dev/null +++ b/src/crosvm/sys/windows/stats.rs @@ -0,0 +1,241 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::cmp::Reverse; +use std::fmt; +use std::time::{Duration, Instant}; + +use devices::BusStatistics; +use hypervisor::VcpuExit; + +const ERROR_RETRY_I32: i32 = winapi::shared::winerror::ERROR_RETRY as i32; + +/// Statistics about the number and duration of VM exits. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct VmExitStatistics { + /// Whether or not statistics have been enabled to measure VM exits. + enabled: bool, + /// Counter of the number of VM exits per-exit-type. The index into the Vec can be determined + /// from a &Result via the `exit_to_index` function. + exit_counters: Vec, + /// Sum of the duration of VM exits per-exit-type. The index into the Vec can be determined + /// from a &Result via the `exit_to_index` function. + exit_durations: Vec, +} + +impl VmExitStatistics { + pub fn new() -> VmExitStatistics { + VmExitStatistics { + enabled: false, + // We have a known number of exit types, and thus a known number of exit indices + exit_counters: vec![0; MAX_EXIT_INT + 1], + exit_durations: vec![Duration::new(0, 0); MAX_EXIT_INT + 1], + } + } + + /// Enable or disable statistics gathering. + pub fn set_enabled(&mut self, enabled: bool) { + self.enabled = enabled; + } + + /// Get the start time of the stat that is to be recorded. + /// + /// If the VmExitStatistics instance is not enabled this will return None. + pub fn start_stat(&self) -> Option { + if !self.enabled { + return None; + } + Some(Instant::now()) + } + + /// Record the end of the stat. + /// + /// The start value return from start_stat should be passed as `start`. If `start` is None or + /// if the VmExitStatistics instance is not enabled this will do nothing. The counters and + /// durations will silently overflow to prevent interference with vm operation. + pub fn end_stat(&mut self, exit: &base::Result, start: Option) { + if !self.enabled || start.is_none() { + return; + } + + let exit_index = exit_to_index(exit); + + // We overflow because we don't want any disruptions to emulator running due to + // statistics + self.exit_counters[exit_index] = self.exit_counters[exit_index].overflowing_add(1).0; + self.exit_durations[exit_index] = self.exit_durations[exit_index] + .checked_add(start.unwrap().elapsed()) + .unwrap_or(Duration::new(0, 0)); // If we overflow, reset to 0 + } + + /// Merge several VmExitStatistics into one. 
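+    ///
+    /// Sketch: combine per-vcpu stats into a single table for display:
+    ///
+    /// ```ignore
+    /// let merged = VmExitStatistics::merged(&[vcpu0_stats, vcpu1_stats]);
+    /// println!("{}", merged);
+    /// ```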
+ pub fn merged(stats: &[VmExitStatistics]) -> VmExitStatistics { + let mut merged = VmExitStatistics::new(); + for other in stats.iter() { + for exit_index in 0..(MAX_EXIT_INT + 1) { + // We overflow because we don't want any disruptions to emulator running due to + // statistics + merged.exit_counters[exit_index] = merged.exit_counters[exit_index] + .overflowing_add(other.exit_counters[exit_index]) + .0; + merged.exit_durations[exit_index] = merged.exit_durations[exit_index] + .checked_add(other.exit_durations[exit_index]) + .unwrap_or(Duration::new(0, 0)); // If we overflow, reset to 0 + } + } + + merged + } + + /// Get a json representation of `self`. Returns an array of maps, where each map contains the + /// count and duration of a particular vmexit. + pub fn json(&self) -> serde_json::Value { + let mut exits = serde_json::json!([]); + let exits_vec = exits.as_array_mut().unwrap(); + for exit_index in 0..(MAX_EXIT_INT + 1) { + exits_vec.push(serde_json::json!({ + "exit_type": exit_index_to_str(exit_index), + "count": self.exit_counters[exit_index], + "duration": { + "seconds": self.exit_durations[exit_index].as_secs(), + "subsecond_nanos": self.exit_durations[exit_index].subsec_nanos(), + } + })) + } + exits + } +} + +impl std::fmt::Display for VmExitStatistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "Exit Type Count Duration")?; + + let mut exit_indices: Vec = (0..(MAX_EXIT_INT + 1)).collect(); + // Sort exit indices by exit_duration + exit_indices.sort_by_key(|i| Reverse(self.exit_durations[*i])); + + for exit_index in exit_indices { + writeln!( + f, + "{:<16}{:<16}{:<16}", + exit_index_to_str(exit_index), + self.exit_counters[exit_index], + // Alignment not implemented by Debug + format!("{:?}", self.exit_durations[exit_index]), + )?; + } + + Ok(()) + } +} + +/// This constant should be set to the maximum integer to which the below functions will map a +/// VcpuExit. +const MAX_EXIT_INT: usize = 13; + +/// Map Vm Exits to exit indexes, which are integers for storage in our counter Vecs. +fn exit_to_index(exit: &base::Result) -> usize { + match exit { + Ok(VcpuExit::Io { .. }) => 0, + Ok(VcpuExit::Mmio { .. }) => 1, + Ok(VcpuExit::IoapicEoi { .. }) => 2, + Ok(VcpuExit::IrqWindowOpen) => 3, + Ok(VcpuExit::Hlt) => 4, + Ok(VcpuExit::Shutdown) => 5, + Ok(VcpuExit::FailEntry { .. }) => 6, + Ok(VcpuExit::SystemEventShutdown) => 7, + Ok(VcpuExit::SystemEventReset) => 7, + Ok(VcpuExit::SystemEventCrash) => 7, + Ok(VcpuExit::Intr) => 8, + Ok(VcpuExit::Cpuid { .. }) => 9, + Err(e) if e.errno() == ERROR_RETRY_I32 => 10, + Err(_) => 11, + Ok(VcpuExit::Canceled) => 12, + _ => 13, + } +} + +/// Give human readable names for each exit type that we've mapped to an exit index in exit_to_index. +fn exit_index_to_str(exit: usize) -> String { + (match exit { + 0 => "Io", + 1 => "Mmio", + 2 => "IoapicEoi", + 3 => "IrqWindowOpen", + 4 => "Hlt", + 5 => "Shutdown", + 6 => "FailEntry", + 7 => "SystemEvent", + 8 => "Intr", + 9 => "Cpuid", + 10 => "Retry", + 11 => "Error", + 12 => "Canceled", + _ => "Unknown", + }) + .to_string() +} + +/// Collects, merges, and displays statistics between vcpu threads. 
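+///
+/// A usage sketch (in practice each vcpu thread contributes one entry per Vec):
+///
+/// ```ignore
+/// let mut stats = StatisticsCollector::new();
+/// stats.vm_exit_stats.push(VmExitStatistics::new());
+/// println!("{}", stats);
+/// println!("{}", stats.json());
+/// ```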
+#[derive(Default, Clone, Debug)] +pub struct StatisticsCollector { + pub pio_bus_stats: Vec, + pub mmio_bus_stats: Vec, + pub vm_exit_stats: Vec, +} + +impl StatisticsCollector { + pub fn new() -> StatisticsCollector { + StatisticsCollector::default() + } + + /// Return a merged version of the pio bus statistics, mmio bus statistics, and the vm exit + /// statistics for all vcpus. + fn merged(&self) -> (BusStatistics, BusStatistics, VmExitStatistics) { + ( + BusStatistics::merged(&self.pio_bus_stats), + BusStatistics::merged(&self.mmio_bus_stats), + VmExitStatistics::merged(&self.vm_exit_stats), + ) + } + + /// Get a json representation of `self`. It contains two top-level keys: "vcpus" and "merged". + /// The "vcpus" key's value is a list of per-vcpu stats, where the "merged" stats contains the + /// sum of all vcpu stats. + pub fn json(&self) -> serde_json::Value { + let mut vcpus = serde_json::json!([]); + let vcpus_vec = vcpus.as_array_mut().unwrap(); + + for i in 0..self.pio_bus_stats.len() { + vcpus_vec.push(serde_json::json!({ + "io": self.pio_bus_stats[i].json(), + "mmio": self.mmio_bus_stats[i].json(), + "exits": self.vm_exit_stats[i].json(), + })); + } + + let (pio, mmio, exits) = self.merged(); + + serde_json::json!({ + "merged": { + "io": pio.json(), + "mmio": mmio.json(), + "exits": exits.json(), + }, + "vcpus": vcpus + }) + } +} + +impl std::fmt::Display for StatisticsCollector { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let (pio, mmio, exits) = self.merged(); + writeln!(f, "Port IO:")?; + writeln!(f, "{}", pio)?; + writeln!(f, "MMIO:")?; + writeln!(f, "{}", mmio)?; + writeln!(f, "Vm Exits:")?; + writeln!(f, "{}", exits) + } +} diff --git a/src/main.rs b/src/main.rs index 81a0b48aea..1d810a6d89 100644 --- a/src/main.rs +++ b/src/main.rs @@ -32,8 +32,11 @@ use vm_control::{ BalloonControlCommand, DiskControlCommand, UsbControlResult, VmRequest, VmResponse, }; +use crate::sys::error_to_exit_code; use crate::sys::init_log; use crosvm::cmdline::{Command, CrossPlatformCommands, CrossPlatformDevicesCommands}; +#[cfg(windows)] +use sys::windows::metrics; #[cfg(feature = "scudo")] #[global_allocator] @@ -91,6 +94,12 @@ where } } Ok(cfg) => { + #[cfg(feature = "crash-report")] + crosvm::sys::setup_emulator_crash_reporting(&cfg)?; + + #[cfg(windows)] + metrics::setup_metrics_reporting()?; + init_log(log_config, &cfg)?; let exit_state = crate::sys::run_config(cfg); to_command_status(exit_state) @@ -463,9 +472,17 @@ fn prepare_argh_args>(args_iter: I) -> Vec Result { + let _library_watcher = sys::get_library_watcher(); + + // The following panic hook will stop our crashpad hook on windows. + // Only initialize when the crash-pad feature is off. #[cfg(not(feature = "crash-report"))] sys::set_panic_hook(); + // Ensure all processes detach from metrics on exit. + #[cfg(windows)] + let _metrics_destructor = metrics::get_destructor(); + let args = prepare_argh_args(std::env::args()); let args = args.iter().map(|s| s.as_str()).collect::>(); let args = match crosvm::cmdline::CrosvmCmdlineArgs::from_args(&args[..1], &args[1..]) { @@ -493,6 +510,16 @@ fn crosvm_main() -> Result { // We handle run_vm separately because it does not simply signal success/error // but also indicates whether the guest requested reset or stop. run_vm(cmd, log_config) + } else if let CrossPlatformCommands::Device(cmd) = command { + // On windows, the device command handles its own logging setup, so we can't handle it below + // otherwise logging will double init. 
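+        // Note: cfg!(unix) is an ordinary boolean constant, not conditional compilation, so the
+        // branch below still has to type-check on Windows.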
+ if cfg!(unix) { + syslog::init_with(log_config) + .map_err(|e| anyhow!("failed to initialize syslog: {}", e))?; + } + start_device(cmd) + .map_err(|_| anyhow!("start_device subcommand failed")) + .map(|_| CommandStatus::Success) } else { syslog::init_with(log_config) .map_err(|e| anyhow!("failed to initialize syslog: {}", e))?; @@ -513,9 +540,7 @@ fn crosvm_main() -> Result { CrossPlatformCommands::CreateQcow2(cmd) => { create_qcow2(cmd).map_err(|_| anyhow!("create_qcow2 subcommand failed")) } - CrossPlatformCommands::Device(cmd) => { - start_device(cmd).map_err(|_| anyhow!("start_device subcommand failed")) - } + CrossPlatformCommands::Device(_) => unreachable!(), CrossPlatformCommands::Disk(cmd) => { disk_cmd(cmd).map_err(|_| anyhow!("disk subcommand failed")) } @@ -590,7 +615,7 @@ fn main() { 34 } Err(e) => { - let exit_code = 1; + let exit_code = error_to_exit_code(&res); error!("exiting with error {}:{:?}", exit_code, e); exit_code } @@ -611,6 +636,8 @@ mod tests { assert!(!is_flag("no-leading-dash")); } + // TODO(b/238361778) this doesn't work on Windows because is_flag isn't called yet. + #[cfg(unix)] #[test] fn args_split_long() { assert_eq!( @@ -621,6 +648,8 @@ mod tests { ); } + // TODO(b/238361778) this doesn't work on Windows because is_flag isn't called yet. + #[cfg(unix)] #[test] fn args_split_short() { assert_eq!( diff --git a/src/sys.rs b/src/sys.rs index 7b8f1cf639..0ea17bf646 100644 --- a/src/sys.rs +++ b/src/sys.rs @@ -7,12 +7,22 @@ cfg_if::cfg_if! { pub(crate) mod unix; use unix as platform; pub(crate) use crate::crosvm::sys::unix::{run_config, ExitState}; + } else if #[cfg(windows)] { + pub(crate) mod windows; + use windows as platform; + pub(crate) use windows::ExitState; + pub(crate) use windows::run_config; } else { compile_error!("Unsupported platform"); } } -pub(crate) use platform::main::{cleanup, init_log, run_command, start_device}; +pub(crate) use platform::main::{ + cleanup, error_to_exit_code, get_library_watcher, init_log, run_command, start_device, +}; + +#[cfg(feature = "kiwi")] +pub(crate) use platform::main::sandbox_lower_token; #[cfg(not(feature = "crash-report"))] pub(crate) use platform::set_panic_hook; diff --git a/src/sys/unix/main.rs b/src/sys/unix/main.rs index 383bed5079..99815cf427 100644 --- a/src/sys/unix/main.rs +++ b/src/sys/unix/main.rs @@ -16,7 +16,7 @@ use devices::virtio::vhost::user::device::{ use crate::{ crosvm::sys::cmdline::{Commands, DevicesSubcommand}, - Config, + CommandStatus, Config, }; pub(crate) fn start_device(command: DevicesSubcommand) -> anyhow::Result<()> { @@ -74,6 +74,10 @@ pub(crate) fn cleanup() { } } +pub fn get_library_watcher() -> std::io::Result<()> { + Ok(()) +} + pub(crate) fn run_command(_cmd: Commands) -> anyhow::Result<()> { Err(anyhow::anyhow!("invalid command")) } @@ -88,3 +92,7 @@ where } Ok(()) } + +pub(crate) fn error_to_exit_code(_res: &std::result::Result) -> i32 { + 1 +} diff --git a/src/sys/windows.rs b/src/sys/windows.rs new file mode 100644 index 0000000000..5b3f8431e3 --- /dev/null +++ b/src/sys/windows.rs @@ -0,0 +1,2078 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
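+
+//! Windows implementation of crosvm's run loop: device and tube setup, hypervisor and irqchip
+//! selection (WHPX, HAXM/GHAXM, GVM), and the main `run_control` event loop.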
+ +pub(crate) mod irq_wait; +pub(crate) mod main; +pub(crate) mod metrics; +#[cfg(not(feature = "crash-report"))] +mod panic_hook; +pub(crate) mod run_vcpu; + +use irq_wait::IrqWaitWorker; +#[cfg(not(feature = "crash-report"))] +pub(crate) use panic_hook::set_panic_hook; +use run_vcpu::{run_all_vcpus, VcpuRunMode}; + +use crate::crosvm::config::{Config, Executable}; +use crate::crosvm::sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow}; +use crate::crosvm::sys::windows::stats::StatisticsCollector; + +use crate::sys::windows::metrics::{log_descriptor, MetricEventType}; +use acpi_tables::sdt::SDT; +#[cfg(all(feature = "kiwi", feature = "anti-tamper",))] +use anti_tamper::spawn_dedicated_anti_tamper_thread; +#[cfg(feature = "kiwi")] +use anyhow::ensure; +use anyhow::{anyhow, bail, Context, Result}; +use arch::{self, LinuxArch, RunnableLinuxVm, VirtioDeviceStub, VmComponents, VmImage}; +#[cfg(feature = "kiwi")] +use base::give_foregrounding_permission; +use base::{ + self, enable_high_res_timers, error, info, warn, Event, EventToken, ExternalMapping, + FromRawDescriptor, RawDescriptor, ReadNotifier, RecvTube, SendTube, Tube, TubeError, + VmEventType, WaitContext, +}; +use devices::serial_device::{SerialHardware, SerialParameters}; +use devices::virtio::block::block::DiskOption; +use devices::virtio::{self, BalloonMode, Console, PvClock}; +use devices::Minijail; +use devices::{ + self, get_tsc_sync_mitigations, standard_deviation, Ac97Dev, BusDeviceObj, TscSyncMitigations, + UserspaceIrqChip, VirtioPciDevice, +}; +#[cfg(feature = "haxm")] +use hypervisor::haxm::{get_use_ghaxm, set_use_ghaxm, Haxm, HaxmVcpu, HaxmVm}; +use hypervisor::{ProtectionType, Vm}; +use resources::SystemAllocator; +use rutabaga_gfx::RutabagaGralloc; +#[cfg(feature = "kiwi")] +use std::convert::TryInto; +use std::fs::{File, OpenOptions}; +use std::iter; +use std::mem; +use std::os::windows::fs::OpenOptionsExt; +use std::sync::Arc; +use sync::Mutex; +use tracing; +#[cfg(feature = "kiwi")] +use vm_control::{ + Ac97Control, BalloonControlCommand, + GpuSendToMain::{self, MuteAc97, SendToService}, + PvClockCommand, PvClockCommandResponse, ServiceSendToGpu, +}; +#[cfg(feature = "gvm")] +use { + devices::GvmIrqChip, + hypervisor::gvm::{Gvm, GvmVcpu, GvmVersion, GvmVm}, +}; + +use vm_control::{VmMemoryRequest, VmRunMode}; +use vm_memory::GuestMemory; +use winapi::um::winnt::FILE_SHARE_READ; +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "haxm"))] +use x86_64::{get_cpu_manufacturer, CpuManufacturer}; + +#[cfg(feature = "gpu")] +use { + crate::crosvm::config::TouchDeviceOption, + base::{BlockingMode, FramingMode, StreamChannel}, + gpu_display::EventDevice, + std::collections::BTreeMap, + std::num::NonZeroU8, +}; +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +use { + aarch64::AArch64 as Arch, + devices::{IrqChip, IrqChipAArch64 as IrqChipArch}, + hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch}, +}; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use { + devices::IrqChipX86_64 as IrqChipArch, + hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch}, + x86_64::{adjust_cpuid, CpuIdContext, X8664arch as Arch}, +}; + +#[cfg(feature = "whpx")] +use { + devices::WhpxSplitIrqChip, + hypervisor::whpx::{Whpx, WhpxFeature, WhpxVcpu, WhpxVm}, + hypervisor::Hypervisor, + hypervisor::HypervisorCap, + hypervisor::HypervisorX86_64, + std::arch::x86_64::{__cpuid, __cpuid_count}, +}; + +use crate::crosvm::sys::config::{HypervisorKind, IrqChipKind}; +use 
broker_ipc::{common_child_setup, CommonChildStartupArgs}; +#[cfg(all(feature = "kiwi", feature = "anti-tamper"))] +use service_ipc::request_utilities::prod::MessageToService; +#[cfg(feature = "kiwi")] +use service_ipc::{ + get_balloon_size, request_utilities::prod::MessageFromService, + service_vm_state::ServiceVmState, ServiceIpc, +}; +use tube_transporter::{TubeToken, TubeTransporterReader}; + +const DEFAULT_GUEST_CID: u64 = 3; + +enum TaggedControlTube { + // TODO: handle vm_control messages as they get added. + #[allow(dead_code)] + Vm(Tube), + VmMemory(Tube), + #[cfg(feature = "kiwi")] + GpuServiceComm(Tube), + #[cfg(feature = "kiwi")] + GpuDeviceServiceComm(Tube), +} + +pub enum ExitState { + Reset, + Stop, + Crash, + #[allow(dead_code)] + GuestPanic, +} + +type DeviceResult = Result; + +fn create_vhost_user_block_device(cfg: &Config, disk_device_tube: Tube) -> DeviceResult { + let features = virtio::base_features(cfg.protected_vm); + let dev = virtio::vhost::user::vmm::Block::new(features, disk_device_tube).exit_context( + Exit::VhostUserBlockDeviceNew, + "failed to set up vhost-user block device", + )?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult { + // Lock the disk image to prevent other crosvm instances from using it, unless it is read_only. + let share_flags = if disk.read_only { FILE_SHARE_READ } else { 0 }; + let raw_image: File = OpenOptions::new() + .read(true) + .write(!disk.read_only) + .share_mode(share_flags) + .open(&disk.path) + .with_exit_context(Exit::Disk, || { + format!("failed to load disk image {}", disk.path.display()) + })?; + + let disk_file = + disk::create_disk_file(raw_image, disk.sparse, disk::MAX_NESTING_DEPTH, &disk.path) + .exit_context(Exit::CreateAsyncDisk, "failed to create virtual disk")?; + let features = virtio::base_features(cfg.protected_vm); + let dev = virtio::Block::new( + features, + disk_file, + disk.read_only, + disk.sparse, + disk.block_size, + disk.id, + Some(disk_device_tube), + ) + .exit_context(Exit::BlockDeviceNew, "failed to create block device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +#[cfg(feature = "gpu")] +fn create_gpu_device( + cfg: &Config, + vm_evt_wrtube: &SendTube, + gpu_device_tube: Tube, + resource_bridges: Vec, + event_devices: Vec, + map_request: Arc>>, + #[cfg(feature = "kiwi")] gpu_device_service_tube: Tube, +) -> DeviceResult { + let gpu_parameters = cfg + .gpu_parameters + .as_ref() + .expect("No GPU parameters provided in config!"); + let display_backends = vec![virtio::DisplayBackend::WinAPI( + (&gpu_parameters.display_params).into(), + )]; + + let features = virtio::base_features(cfg.protected_vm); + let dev = virtio::Gpu::new( + vm_evt_wrtube + .try_clone() + .exit_context(Exit::CloneTube, "failed to clone tube")?, + Some(gpu_device_tube), + NonZeroU8::new(1).unwrap(), // number of scanouts + resource_bridges, + display_backends, + gpu_parameters, + event_devices, + map_request, + /* external_blob= */ false, + features, + BTreeMap::new(), + #[cfg(feature = "kiwi")] + Some(gpu_device_service_tube), + ); + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +#[cfg(feature = "gpu")] +fn create_multi_touch_device( + cfg: &Config, + multi_touch_spec: &TouchDeviceOption, + event_pipe: StreamChannel, + idx: u32, +) -> DeviceResult { + let (width, height) = multi_touch_spec.get_size(); + let dev = virtio::new_multi_touch( + idx, + event_pipe, 
+ width, + height, + virtio::base_features(cfg.protected_vm), + ) + .exit_context(Exit::InputDeviceNew, "failed to set up input device")?; + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +#[cfg(feature = "gpu")] +fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult { + let dev = virtio::new_mouse(idx, event_pipe, virtio::base_features(cfg.protected_vm)) + .exit_context(Exit::InputDeviceNew, "failed to set up input device")?; + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +#[cfg(feature = "slirp")] +fn create_net_device( + #[cfg(feature = "slirp-ring-capture")] slirp_capture_file: &Option, +) -> DeviceResult { + let dev = virtio::Net::::new_slirp( + #[cfg(feature = "slirp-ring-capture")] + slirp_capture_file, + ) + .exit_context(Exit::NetDeviceNew, "failed to set up virtio networking")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +#[cfg(feature = "slirp")] +fn create_vhost_user_net_device(cfg: &Config, net_device_tube: Tube) -> DeviceResult { + let features = virtio::base_features(cfg.protected_vm); + let dev = virtio::vhost::user::vmm::Net::new(features, net_device_tube).exit_context( + Exit::VhostUserNetDeviceNew, + "failed to set up vhost-user net device", + )?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +fn create_rng_device(cfg: &Config) -> DeviceResult { + let dev = virtio::Rng::new(virtio::base_features(cfg.protected_vm)) + .exit_context(Exit::RngDeviceNew, "failed to set up rng")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult { + let mut keep_rds = Vec::new(); + let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?; + let dev = param + .create_serial_device::(cfg.protected_vm, &evt, &mut keep_rds) + .exit_context(Exit::CreateConsole, "failed to create console device")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +#[allow(dead_code)] // TODO(b/234031017): balloon device startup gets stuck on Windows +fn create_balloon_device( + cfg: &Config, + balloon_device_tube: Tube, + dynamic_mapping_device_tube: Tube, + inflate_tube: Option, + init_balloon_size: u64, +) -> DeviceResult { + let dev = virtio::Balloon::new( + virtio::base_features(cfg.protected_vm), + balloon_device_tube, + dynamic_mapping_device_tube, + inflate_tube, + init_balloon_size, + if cfg.strict_balloon { + BalloonMode::Strict + } else { + BalloonMode::Relaxed + }, + ) + .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +fn create_vsock_device(cfg: &Config) -> DeviceResult { + // We only support a single guest, so we can confidently assign a default + // CID if one isn't provided. We choose the lowest non-reserved value. 
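+    // (vsock reserves CIDs 0-2 for the hypervisor, loopback, and the host, which makes 3 the
+    // lowest CID available to a guest.)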
+ let dev = virtio::Vsock::new( + cfg.cid.unwrap_or(DEFAULT_GUEST_CID), + cfg.host_guid.clone(), + virtio::base_features(cfg.protected_vm), + ) + .exit_context( + Exit::UserspaceVsockDeviceNew, + "failed to create userspace vsock device", + )?; + + Ok(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }) +} + +#[cfg_attr(not(feature = "gpu"), allow(unused_variables))] +fn create_virtio_devices( + cfg: &mut Config, + vm_evt_wrtube: &SendTube, + gpu_device_tube: Tube, + disk_device_tubes: &mut Vec, + _balloon_device_tube: Option, + pvclock_device_tube: Option, + _dynamic_mapping_device_tube: Option, + _inflate_tube: Option, + _init_balloon_size: u64, + map_request: Arc>>, + #[cfg(feature = "kiwi")] gpu_device_service_tube: Tube, + tsc_frequency: u64, +) -> DeviceResult> { + let mut devs = Vec::new(); + + if cfg.block_vhost_user_tube.is_empty() { + // Disk devices must precede virtio-console devices or the kernel does not boot. + // TODO(b/171215421): figure out why this ordering is required and fix it. + for disk in &cfg.disks { + let disk_device_tube = disk_device_tubes.remove(0); + devs.push(create_block_device(cfg, disk, disk_device_tube)?); + } + } else { + info!("Starting up vhost user block backends..."); + for _disk in &cfg.disks { + let disk_device_tube = cfg.block_vhost_user_tube.remove(0); + devs.push(create_vhost_user_block_device(cfg, disk_device_tube)?); + } + } + + for (_, param) in cfg + .serial_parameters + .iter() + .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole) + { + let dev = create_console_device(cfg, param)?; + devs.push(dev); + } + + if let Some(tube) = pvclock_device_tube { + devs.push(VirtioDeviceStub { + dev: Box::new(PvClock::new(tsc_frequency, tube)), + jail: None, + }); + } + + devs.push(create_rng_device(cfg)?); + + #[cfg(feature = "slirp")] + if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() { + devs.push(create_vhost_user_net_device(cfg, net_vhost_user_tube)?); + } else { + devs.push(create_net_device( + #[cfg(feature = "slirp-ring-capture")] + &cfg.slirp_capture_file, + )?); + } + + // TODO(b/234031017): balloon device startup gets stuck on Windows + //if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) = + // (balloon_device_tube, dynamic_mapping_device_tube) + //{ + // devs.push(create_balloon_device( + // &cfg, + // balloon_device_tube, + // dynamic_mapping_device_tube, + // inflate_tube, + // init_balloon_size, + // )?); + //} + + devs.push(create_vsock_device(&cfg)?); + + #[cfg(feature = "gpu")] + { + let resource_bridges = Vec::::new(); + let mut event_devices: Vec = Vec::new(); + + if !cfg.virtio_single_touch.is_empty() { + unimplemented!("--single-touch is no longer supported. 
Use --multi-touch instead."); + } + + for (idx, multi_touch_spec) in cfg.virtio_multi_touch.iter().enumerate() { + let (event_device_pipe, virtio_input_pipe) = + StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte) + .exit_context(Exit::EventDeviceSetup, "failed to set up EventDevice")?; + + devs.push(create_multi_touch_device( + cfg, + multi_touch_spec, + virtio_input_pipe, + idx as u32, + )?); + event_devices.push(EventDevice::touchscreen(event_device_pipe)); + } + + for (idx, _mouse_socket) in cfg.virtio_mice.iter().enumerate() { + let (event_device_pipe, virtio_input_pipe) = + StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte) + .exit_context(Exit::EventDeviceSetup, "failed to set up EventDevice")?; + devs.push(create_mouse_device(cfg, virtio_input_pipe, idx as u32)?); + event_devices.push(EventDevice::mouse(event_device_pipe)); + } + + let (event_device_pipe, virtio_input_pipe) = + StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte) + .exit_context(Exit::EventDeviceSetup, "failed to set up EventDevice")?; + + let dev = virtio::new_keyboard( + /* idx= */ 0, + virtio_input_pipe, + virtio::base_features(cfg.protected_vm), + ) + .exit_context(Exit::InputDeviceNew, "failed to set up input device")?; + devs.push(VirtioDeviceStub { + dev: Box::new(dev), + jail: None, + }); + event_devices.push(EventDevice::keyboard(event_device_pipe)); + + devs.push(create_gpu_device( + cfg, + vm_evt_wrtube, + gpu_device_tube, + resource_bridges, + event_devices, + map_request, + #[cfg(feature = "kiwi")] + gpu_device_service_tube, + )?); + } + + Ok(devs) +} + +fn create_devices( + cfg: &mut Config, + mem: &GuestMemory, + exit_evt_wrtube: &SendTube, + irq_control_tubes: &mut Vec, + gpu_device_tube: Tube, + disk_device_tubes: &mut Vec, + balloon_device_tube: Option, + pvclock_device_tube: Option, + dynamic_mapping_device_tube: Option, + inflate_tube: Option, + init_balloon_size: u64, + map_request: Arc>>, + ac97_device_tubes: Vec, + #[cfg(feature = "kiwi")] gpu_device_service_tube: Tube, + tsc_frequency: u64, +) -> DeviceResult, Option)>> { + let stubs = create_virtio_devices( + cfg, + exit_evt_wrtube, + gpu_device_tube, + disk_device_tubes, + balloon_device_tube, + pvclock_device_tube, + dynamic_mapping_device_tube, + inflate_tube, + init_balloon_size, + map_request, + #[cfg(feature = "kiwi")] + gpu_device_service_tube, + tsc_frequency, + )?; + + let mut pci_devices = Vec::new(); + + for stub in stubs { + let (msi_host_tube, msi_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + irq_control_tubes.push(msi_host_tube); + + let dev = Box::new( + VirtioPciDevice::new( + mem.clone(), + stub.dev, + msi_device_tube, + cfg.disable_virtio_intx, + None, + ) + .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?, + ) as Box; + pci_devices.push((dev, stub.jail)); + } + + if cfg.ac97_parameters.len() != ac97_device_tubes.len() { + panic!( + "{} Ac97 device(s) will be made, but only {} Ac97 device tubes are present.", + cfg.ac97_parameters.len(), + ac97_device_tubes.len() + ); + } + + for (ac97_param, ac97_device_tube) in cfg + .ac97_parameters + .iter() + .zip(ac97_device_tubes.into_iter()) + { + let dev = Ac97Dev::try_new(mem.clone(), ac97_param.clone(), ac97_device_tube) + .exit_context(Exit::CreateAc97, "failed to create ac97 device")?; + pci_devices.push((Box::new(dev), None)); + } + + Ok(pci_devices) +} + +#[cfg(feature = "kiwi")] +fn set_package_name(msg: &MessageFromService) { + match msg { + 
MessageFromService::HideWindow => {
+            #[cfg(feature = "crash-report")]
+            crash_report::set_package_name("");
+
+            metrics::set_package_name("");
+        }
+        MessageFromService::ShowWindow(ref show) => {
+            #[cfg(feature = "crash-report")]
+            crash_report::set_package_name(&show.package_name);
+
+            metrics::set_package_name(&show.package_name);
+        }
+        _ => {}
+    }
+}
+
+#[cfg(feature = "kiwi")]
+fn merge_session_invariants(serialized_session_invariants: &[u8]) {
+    metrics::merge_session_invariants(serialized_session_invariants);
+}
+
+#[derive(Debug)]
+struct PvClockError(String);
+
+/// Sending a pvclock command to the pvclock device can be tricky because we need to wait for a
+/// response from the pvclock device if it's running. But it's possible that the device is not
+/// set up yet (or never will be, because the guest doesn't support it). In that case, we want to
+/// time out on recv-ing a response, and to do that we need to do a wait_timeout on the Tube's
+/// read_notifier.
+#[cfg(feature = "kiwi")]
+fn handle_pvclock_request(tube: &Option<Tube>, command: PvClockCommand) -> Result<()> {
+    if let Some(ref tube) = tube {
+        tube.send(&command)
+            .with_context(|| format!("failed to send pvclock command {:?}", command))?;
+
+        #[derive(EventToken)]
+        enum Token {
+            RecvReady,
+        }
+
+        let wait_ctx = WaitContext::build_with(&[(tube.get_read_notifier(), Token::RecvReady)])
+            .context("failed to build pvclock wait context")?;
+
+        let evts = wait_ctx
+            .wait_timeout(std::time::Duration::from_millis(100))
+            .context("failed to wait on pvclock wait context")?;
+
+        ensure!(evts.len() > 0, "timed out waiting for pvclock response");
+
+        let resp = tube
+            .recv::<PvClockCommandResponse>()
+            .context("failed to receive pvclock command response")?;
+
+        if let PvClockCommandResponse::Err(e) = resp {
+            bail!("pvclock encountered error on {:?}: {}", command, e);
+        }
+    }
+
+    Ok(())
+}
+
+fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
+    mut guest_os: RunnableLinuxVm<V, Vcpu>,
+    sys_allocator: SystemAllocator,
+    mut control_tubes: Vec<TaggedControlTube>,
+    irq_control_tubes: Vec<Tube>,
+    vm_evt_rdtube: RecvTube,
+    vm_evt_wrtube: SendTube,
+    broker_shutdown_evt: Option<Event>,
+    balloon_host_tube: Option<Tube>,
+    pvclock_host_tube: Option<Tube>,
+    map_request: Arc<Mutex<Option<ExternalMapping>>>,
+    mut gralloc: RutabagaGralloc,
+    stats: Option<Arc<Mutex<StatisticsCollector>>>,
+    #[cfg(feature = "kiwi")] service_pipe_name: Option<String>,
+    ac97_host_tubes: Vec<Tube>,
+    memory_size_mb: u64,
+    host_cpu_topology: bool,
+    tsc_sync_mitigations: TscSyncMitigations,
+    force_calibrated_tsc_leaf: bool,
+) -> Result<ExitState> {
+    #[cfg(not(feature = "kiwi"))]
+    {
+        // These variables are not used in other configurations. Suppress warnings.
+        let _ = balloon_host_tube;
+        let _ = pvclock_host_tube;
+        let _ = ac97_host_tubes;
+        let _ = memory_size_mb;
+    }
+
+    #[derive(EventToken)]
+    enum Token {
+        VmEvent,
+        BrokerShutdown,
+        VmControl {
+            index: usize,
+        },
+        #[cfg(feature = "kiwi")]
+        ServiceIpc,
+        #[cfg(feature = "proto-tube-hack")]
+        ProtoIpc,
+        #[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+        AntiTamper,
+    }
+
+    #[cfg(feature = "kiwi")]
+    // Note: We use anti_tamper::MAX_CHALLENGE_SIZE because it's the largest message passed
+    // through the tube. Note the Tube buffer has to accommodate the largest message because of
+    // b/223807352.
+ let (ipc_main_loop_tube, ipc_service_ipc_tube) = + Tube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE) + .expect("Could not create Tube::pair()!"); + + #[cfg(feature = "proto-tube-hack")] + let (proto_main_loop_tube, proto_service_ipc_tube) = + base::ProtoTube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE) + .expect("Could not create Tube::pair()!"); + + #[cfg(feature = "kiwi")] + let _service_ipc = ServiceIpc::start_ipc_listening_loops( + service_pipe_name, + ipc_service_ipc_tube, + #[cfg(feature = "proto-tube-hack")] + proto_service_ipc_tube, + ); + + #[cfg(feature = "kiwi")] + let mut service_vm_state = ServiceVmState::new(); + + let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator)); + + let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?; + + // Create a separate thread to wait on IRQ events. This is a natural division + // because IRQ interrupts have no dependencies on other events, and this lets + // us avoid approaching the Windows WaitForMultipleObjects 64-object limit. + let irq_join_handle = IrqWaitWorker::start( + exit_evt + .try_clone() + .exit_context(Exit::CloneEvent, "failed to clone event")?, + guest_os + .irq_chip + .try_box_clone() + .exit_context(Exit::CloneEvent, "failed to clone irq chip")?, + irq_control_tubes, + sys_allocator_mutex.clone(), + ); + + let wait_ctx = WaitContext::build_with(&[ + (vm_evt_rdtube.get_read_notifier(), Token::VmEvent), + #[cfg(feature = "kiwi")] + (ipc_main_loop_tube.get_read_notifier(), Token::ServiceIpc), + #[cfg(feature = "proto-tube-hack")] + (proto_main_loop_tube.get_read_notifier(), Token::ProtoIpc), + ]) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to wait context", + )?; + if let Some(evt) = broker_shutdown_evt.as_ref() { + wait_ctx.add(evt, Token::BrokerShutdown).exit_context( + Exit::WaitContextAdd, + "failed to add trigger to wait context", + )?; + } + + for (index, control_tube) in control_tubes.iter().enumerate() { + #[allow(clippy::single_match)] + match control_tube { + TaggedControlTube::VmMemory(tube) => { + wait_ctx + .add(tube.get_read_notifier(), Token::VmControl { index }) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to wait context", + )?; + } + #[cfg(feature = "kiwi")] + TaggedControlTube::GpuServiceComm(tube) => { + wait_ctx + .add(tube.get_read_notifier(), Token::VmControl { index }) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to wait context", + )?; + } + #[cfg(feature = "kiwi")] + TaggedControlTube::GpuDeviceServiceComm(tube) => { + wait_ctx + .add(tube.get_read_notifier(), Token::VmControl { index }) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to wait context", + )?; + } + // TODO(nkgold): as new control tubes are added, we'll need to add support for them + _ => (), + } + } + + let vcpus: Vec> = match guest_os.vcpus.take() { + Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(), + None => iter::repeat_with(|| None) + .take(guest_os.vcpu_count) + .collect(), + }; + + #[cfg(all( + feature = "kiwi", + feature = "anti-tamper", + not(feature = "proto-tube-hack") + ))] + let (anti_tamper_main_thread_tube, anti_tamper_dedicated_thread_tube) = + Tube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE) + .expect("Could not create Tube::pair()!"); + + #[cfg(all(feature = "kiwi", feature = "anti-tamper", feature = "proto-tube-hack"))] + let (anti_tamper_main_thread_tube, anti_tamper_dedicated_thread_tube) = + 
base::ProtoTube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE) + .expect("Could not create Tube::pair()!"); + + #[cfg(all(feature = "kiwi", feature = "anti-tamper",))] + if let Err(_e) = wait_ctx.add( + anti_tamper_main_thread_tube.get_read_notifier(), + Token::AntiTamper, + ) { + #[cfg(debug_assertions)] + error!("Failed to add anti-tamper tube to wait_ctx: {}", _e); + } + + #[cfg(all(feature = "kiwi", feature = "anti-tamper",))] + spawn_dedicated_anti_tamper_thread(anti_tamper_dedicated_thread_tube); + + if sandbox::is_sandbox_target() { + sandbox::TargetServices::get() + .exit_context(Exit::SandboxError, "failed to create sandbox")? + .expect("Could not create sandbox!") + .lower_token(); + } + + let vcpu_boxes: Arc>>> = Arc::new(Mutex::new(Vec::new())); + let run_mode_arc = Arc::new(VcpuRunMode::default()); + let vcpu_threads = run_all_vcpus( + vcpus, + vcpu_boxes.clone(), + &guest_os, + &exit_evt, + &vm_evt_wrtube, + &pvclock_host_tube, + &stats, + host_cpu_topology, + run_mode_arc.clone(), + tsc_sync_mitigations, + force_calibrated_tsc_leaf, + )?; + let mut exit_state = ExitState::Stop; + + // TODO: udam b/142733266 (sandboxing) registerwaitforsingleobject to wait on + // child processes when they exit + 'poll: loop { + let events = { + match wait_ctx.wait() { + Ok(v) => v, + Err(e) => { + error!("failed to wait: {}", e); + break; + } + } + }; + + let mut vm_control_indices_to_remove = Vec::new(); + for event in events.iter().filter(|e| e.is_readable) { + match event.token { + Token::VmEvent => match vm_evt_rdtube.recv::() { + Ok(vm_event) => { + match vm_event { + VmEventType::Exit => { + info!("vcpu requested shutdown"); + exit_state = ExitState::Stop; + } + VmEventType::Reset => { + info!("vcpu requested reset"); + exit_state = ExitState::Reset; + } + VmEventType::Crash => { + info!("vcpu crashed"); + exit_state = ExitState::Crash; + } + VmEventType::Panic(_) => { + error!("got pvpanic event. 
this event is not expected on Windows."); + } + } + break 'poll; + } + Err(e) => { + warn!("failed to recv VmEvent: {}", e); + } + }, + Token::BrokerShutdown => { + info!("main loop got broker shutdown event"); + break 'poll; + } + Token::VmControl { index } => { + if let Some(tube) = control_tubes.get(index) { + #[allow(clippy::single_match)] + match tube { + TaggedControlTube::VmMemory(tube) => { + match tube.recv::() { + Ok(request) => { + let response = request.execute( + &mut guest_os.vm, + &mut sys_allocator_mutex.lock(), + Arc::clone(&map_request), + &mut gralloc, + ); + if let Err(e) = tube.send(&response) { + error!("failed to send VmMemoryControlResponse: {}", e); + } + } + Err(e) => { + if let TubeError::Disconnected = e { + vm_control_indices_to_remove.push(index); + } else { + error!("failed to recv VmMemoryControlRequest: {}", e); + } + } + } + } + #[cfg(feature = "kiwi")] + TaggedControlTube::GpuServiceComm(tube) + | TaggedControlTube::GpuDeviceServiceComm(tube) => { + match tube.recv::() { + Ok(request) => { + #[cfg(feature = "kiwi")] + { + match request { + SendToService(service_request) => { + if let Err(e) = ipc_main_loop_tube.send( + &service_vm_state + .update_gpu_state_and_generate_message_to_service(&service_request), + ) { + error!( + "Failed to send message to ServiceIpc: {}", + e + ); + } + } + MuteAc97(mute) => { + for ac97_host_tube in &ac97_host_tubes { + ac97_host_tube + .send(&Ac97Control::Mute(mute)) + .expect("Could not send mute message!"); + } + service_vm_state.update_audio_state(mute); + if let Err(e) = ipc_main_loop_tube.send( + &service_vm_state + .generate_send_state_message(), + ) { + error!( + "Failed to send message to ServiceIpc: {}", + e + ); + } + + } + } + } + #[cfg(not(feature = "kiwi"))] + { + info!("Dropping message: {:?}", request); + } + } + Err(e) => { + error!( + "Error when receiving message from GpuServiceComm or GpuDeviceServiceComm tube: {}", + e + ); + } + } + } + _ => (), + // TODO: handle vm_control messages. + /* TaggedControlTube::Vm(tube) => match tube.recv::() { + Ok(request) => { + let mut run_mode_opt = None; + let response = request.execute( + &mut run_mode_opt, + disk_host_tubes, + ); + if let Err(e) = tube.send(&response) { + error!("failed to send VmResponse: {}", e); + } + if let Some(run_mode) = run_mode_opt { + info!("control tube changed run mode to {}", run_mode); + match run_mode { + VmRunMode::Exiting => { + break 'poll; + } + } + } + } + Err(e) => { + if let TubeError::Disconnected = e { + vm_control_indices_to_remove.push(index); + } else { + error!("failed to recv VmRequest: {}", e); + } + } + }, */ + } + } + } + #[cfg(feature = "proto-tube-hack")] + Token::ProtoIpc => { + anti_tamper::forward_security_challenge( + &proto_main_loop_tube, + &anti_tamper_main_thread_tube, + ); + } + // For handling service to crosvm messages. At this point, it is up to the dev how + // they want to get the datagram to their component. It's recommended to use + // Tubes if it can't be sent directly. 
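+                // Each arm below maps one MessageFromService variant onto the corresponding
+                // VM-side action: window/package state, balloon sizing, suspend/resume, audio
+                // mute, anti-tamper challenges, and metrics/crash-report plumbing.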
+ #[cfg(feature = "kiwi")] + Token::ServiceIpc => match ipc_main_loop_tube.recv::() { + Ok(request) => match request { + MessageFromService::ShowWindow(_) + | MessageFromService::HideWindow + | MessageFromService::Shutdown + | MessageFromService::MouseInputMode(_) => { + set_package_name(&request); + for control_tube in &control_tubes { + if let TaggedControlTube::GpuServiceComm(tube) = &control_tube { + if let Err(e) = + tube.send::(&request.try_into().expect( + "Could not convert to ServiceSendToGpu request!", + )) + { + error!("Failed to send message to GPU display: {}", e); + } + break; + } + } + } + MessageFromService::SetVmMemorySize(balloon_request) => { + info!( + "Service requested balloon adjustment, requested vm size: {}mb", + balloon_request.get_vm_memory_size_mb() + ); + if let Some(ref balloon_host_tube) = balloon_host_tube { + if let Err(e) = + balloon_host_tube.send(&BalloonControlCommand::Adjust { + num_bytes: get_balloon_size( + memory_size_mb, + &balloon_request, + ), + }) + { + error!("Failed to modify balloon size - tube closed: {}", e); + } + } else { + error!("Failed to modify balloon size - balloon disabled"); + } + } + MessageFromService::Suspend => { + info!("Received suspend request from the service"); + // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM. + run_mode_arc.set_and_notify(VmRunMode::Suspending); + + // Force all vcpus to exit from the hypervisor + for vcpu in vcpu_boxes.lock().iter() { + vcpu.set_immediate_exit(true); + } + guest_os.irq_chip.kick_halted_vcpus(); + + handle_pvclock_request(&pvclock_host_tube, PvClockCommand::Suspend) + .unwrap_or_else(|e| { + error!("Error handling pvclock suspend: {:?}", e) + }); + } + MessageFromService::Resume => { + info!("Received resume request from the service"); + handle_pvclock_request(&pvclock_host_tube, PvClockCommand::Resume) + .unwrap_or_else(|e| { + error!("Error handling pvclock resume: {:?}", e) + }); + + // Make sure any immediate exit bits are disabled + for vcpu in vcpu_boxes.lock().iter() { + vcpu.set_immediate_exit(false); + } + + run_mode_arc.set_and_notify(VmRunMode::Running); + } + #[cfg(any(not(feature = "anti-tamper"), feature = "proto-tube-hack"))] + MessageFromService::ReceiveSecurityChallenge(_) => {} + #[cfg(all(feature = "anti-tamper", not(feature = "proto-tube-hack")))] + MessageFromService::ReceiveSecurityChallenge(security_challenge) => { + if let Err(_e) = anti_tamper_main_thread_tube.send(&security_challenge) + { + #[cfg(debug_assertions)] + error!( + "Failed to send challenge program to anti-tamper thread: {}", + _e + ); + } + } + // Receive a mute request when the service receives lock/unlock screen event. The + // mute request should only be received if the window is NOT hidden (the service + // is responsible for that). 
+ MessageFromService::AudioState(set_audio_state_request) => { + for ac97_host_tube in &ac97_host_tubes { + ac97_host_tube + .send(&Ac97Control::Mute(set_audio_state_request.get_is_mute())) + .expect("Could not send mute message!"); + } + service_vm_state + .update_audio_state(set_audio_state_request.get_is_mute()); + + if let Err(e) = ipc_main_loop_tube + .send(&service_vm_state.generate_send_state_message()) + { + error!("Failed to send message to ServiceIpc: {}", e); + } + } + MessageFromService::GetForegroundingPermission( + foregrounding_permission_request, + ) => { + // Perform best-effort, but do not block on failure + // TODO(b/205917759): Move this to gpu process + let mut result = false; + if let Err(e) = give_foregrounding_permission( + foregrounding_permission_request.get_process_id(), + ) { + error!("Failed to give foregrounding permission: {}", e); + } else { + result = true; + } + + if let Err(e) = ipc_main_loop_tube.send( + &MessageToService::SendForegroundingPermissionResult(result.into()), + ) { + // Log, but otherwise ignore failures to send as they are + // handleable and non-fatal. + error!( + "Failed to send foregrounding permission result to the service: {}", + e + ); + } + } + MessageFromService::MergeSessionInvariants(session_invariants_request) => { + let serialized_session_invariants = + session_invariants_request.get_serialized_session_invariants(); + merge_session_invariants(serialized_session_invariants); + } + + MessageFromService::SetAuthToken(set_auth_token_request) => { + metrics::set_auth_token(set_auth_token_request.get_auth_token()); + } + MessageFromService::UploadCrashReport => { + #[cfg(feature = "crash-report")] + crash_report::upload_crash_report("anr"); + + #[cfg(not(feature = "crash-report"))] + info!("Dropping UploadCrashReport message"); + } + MessageFromService::SystemHealthRequest => { + // Reply back with an empty report as there are no system health metrics + // to report yet. + if let Err(e) = + ipc_main_loop_tube.send(&MessageToService::SendSystemHealthReport()) + { + #[cfg(debug_assertions)] + error!("Failed to send system health report to the service: {}", e); + } + } + }, + Err(_e) => {} + }, + #[cfg(all( + feature = "kiwi", + feature = "anti-tamper", + not(feature = "proto-tube-hack") + ))] + Token::AntiTamper => { + match anti_tamper_main_thread_tube.recv::() { + Ok(msg) => { + if let Err(_e) = ipc_main_loop_tube.send(&msg) { + #[cfg(debug_assertions)] + error!("Failed to send anti-tamper signal to the service: {}", _e); + } + } + Err(_e) => { + #[cfg(debug_assertions)] + error!( + "Failed to receive challenge signal from anti-tamper thread: {}", + _e + ); + } + } + } + #[cfg(all(feature = "kiwi", feature = "anti-tamper", feature = "proto-tube-hack"))] + Token::AntiTamper => anti_tamper::forward_security_signal( + &anti_tamper_main_thread_tube, + &ipc_main_loop_tube, + ), + } + } + for event in events.iter().filter(|e| e.is_hungup) { + match event.token { + Token::VmEvent | Token::BrokerShutdown => {} + #[allow(unused_variables)] + Token::VmControl { index } => { + // TODO: handle vm control messages as they get ported. + // It's possible more data is readable and buffered while the tube is hungup, + // so don't delete the tube from the poll context until we're sure all the + // data is read. 
+                    /*match control_tubes
+                        .get(index)
+                        .map(|s| s.as_ref().get_readable_bytes())
+                    {
+                        Some(Ok(0)) | Some(Err(_)) => vm_control_indices_to_remove.push(index),
+                        Some(Ok(x)) => info!("control index {} has {} bytes readable", index, x),
+                        _ => {}
+                    }*/
+                }
+                #[cfg(feature = "proto-tube-hack")]
+                Token::ProtoIpc => {}
+                #[cfg(feature = "kiwi")]
+                Token::ServiceIpc => {}
+                #[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+                Token::AntiTamper => {}
+            }
+        }
+
+        // Sort in reverse so the highest indexes are removed first. This removal algorithm
+        // preserves correct indexes as each element is removed.
+        //vm_control_indices_to_remove.sort_unstable_by(|a, b| b.cmp(a));
+        vm_control_indices_to_remove.dedup();
+        for index in vm_control_indices_to_remove {
+            control_tubes.swap_remove(index);
+            /*if let Some(tube) = control_tubes.get(index) {
+                wait_ctx
+                    .modify(
+                        tube, Token::VmControl { index },
+                        EventType::Read
+                    )
+                    .exit_context(Exit::WaitContextAdd, "failed to add trigger to wait context")?;
+            }*/
+        }
+    }
+
+    // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
+    run_mode_arc.set_and_notify(VmRunMode::Exiting);
+
+    // Force all vcpus to exit from the hypervisor
+    for vcpu in vcpu_boxes.lock().iter() {
+        vcpu.set_immediate_exit(true);
+    }
+
+    let mut res = Ok(exit_state);
+    guest_os.irq_chip.kick_halted_vcpus();
+    let _ = exit_evt.write(1);
+    // Make sure any child threads have ended by sending the Exit vm event (possibly again) so
+    // that their run loops are aborted.
+    let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
+    for (i, thread) in vcpu_threads.into_iter().enumerate() {
+        // Wait until all the threads exit so that the guest_os.vm Arc reference count drops to
+        // 1; otherwise, forcibly killing a thread with terminate would leak its memory.
+        match thread.join() {
+            Ok(Err(e)) => {
+                error!("vcpu thread {} exited with an error: {}", i, e);
+                res = Err(e);
+            }
+            Ok(_) => {}
+            Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
+        }
+    }
+
+    // This cancels all the outstanding and any future blocking operations.
+    // TODO(b/196911556): Shutdown the executor for a cleaner shutdown. Since we are using the
+    // global executor, a cleaner shutdown requires calling disarm so that pending requests still
+    // run and get cancelled. If we called shutdown, all blocking threads would go away, incoming
+    // operations would never be scheduled, and dropping them would panic. The ideal place to
+    // call shutdown is when we drop a non-global executor.
+    cros_async::unblock_disarm();
+
+    let _ = irq_join_handle.join();
+
+    if let Some(stats) = stats {
+        println!("Statistics Collected:\n{}", stats.lock());
+        println!("Statistics JSON:\n{}", stats.lock().json());
+    }
+
+    // Explicitly drop the VM structure here to allow the devices to clean up before the
+    // control tubes are closed when this function exits.
+ mem::drop(guest_os); + + res +} + +#[cfg(feature = "gvm")] +const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion { + major: 1, + minor: 4, + patch: 1, +}; + +#[cfg(feature = "gvm")] +fn create_gvm(mem: GuestMemory) -> Result { + info!("Creating GVM"); + let gvm = Gvm::new()?; + match gvm.get_full_version() { + Ok(version) => { + if version < GVM_MINIMUM_VERSION { + error!( + "GVM version {} is below minimum version {}", + version, GVM_MINIMUM_VERSION + ); + return Err(base::Error::new(libc::ENXIO).into()); + } else { + info!("Using GVM version {}.", version) + } + } + Err(e) => { + error!("unable to determine gvm version: {}", e); + return Err(base::Error::new(libc::ENXIO).into()); + } + } + let vm = GvmVm::new(&gvm, mem)?; + Ok(vm) +} + +#[cfg(feature = "haxm")] +fn create_haxm(mem: GuestMemory, kernel_log_file: &Option) -> Result { + info!("Creating HAXM ghaxm={}", get_use_ghaxm()); + let haxm = Haxm::new()?; + let vm = HaxmVm::new(&haxm, mem)?; + if let Some(path) = kernel_log_file { + use hypervisor::haxm::HAX_CAP_VM_LOG; + if vm.check_raw_capability(HAX_CAP_VM_LOG) { + match vm.register_log_file(&path) { + Ok(_) => {} + Err(e) => match e.errno() { + libc::E2BIG => { + error!( + "kernel_log_file path is too long, kernel log file will not be written" + ); + } + _ => return Err(e.into()), + }, + } + } else { + warn!( + "kernel_log_file specified but this version of HAXM does not support kernel log \ + files" + ); + } + } + Ok(vm) +} + +#[cfg(feature = "whpx")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn create_whpx( + mem: GuestMemory, + cpu_count: usize, + no_smt: bool, + apic_emulation: bool, + force_calibrated_tsc_leaf: bool, +) -> Result { + info!("Creating Whpx"); + let whpx = Whpx::new()?; + + // context for non-cpu-specific cpuid results + let ctx = CpuIdContext::new( + 0, + cpu_count, + no_smt, + /*host_cpu_topology=*/ false, + None, + /* enable_pnp_data */ false, + /* itmt */ false, + force_calibrated_tsc_leaf, + whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired), + __cpuid_count, + __cpuid, + ); + + // Get all cpuid entries that we should pre-set + let mut cpuid = whpx.get_supported_cpuid()?; + + // Adjust them for crosvm + for entry in cpuid.cpu_id_entries.iter_mut() { + adjust_cpuid(entry, &ctx); + } + + let vm = WhpxVm::new(&whpx, cpu_count, mem, cpuid, apic_emulation) + .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?; + + Ok(vm) +} + +#[cfg(feature = "gvm")] +fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result { + info!("Creating GVM irqchip"); + let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?; + Ok(irq_chip) +} + +#[cfg(feature = "whpx")] +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +fn create_whpx_split_irq_chip( + vm: &WhpxVm, + ioapic_device_tube: Tube, +) -> base::Result { + info!("Creating WHPX split irqchip"); + WhpxSplitIrqChip::new( + vm.try_clone()?, + ioapic_device_tube, + None, // ioapic_pins + ) +} + +fn create_userspace_irq_chip( + vcpu_count: usize, + ioapic_device_tube: Tube, +) -> base::Result> +where + Vm: VmArch + 'static, + Vcpu: VcpuArch + 'static, +{ + info!("Creating userspace irqchip"); + let irq_chip = + UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /*ioapic_pins:*/ None)?; + Ok(irq_chip) +} + +pub fn get_default_hypervisor() -> Result { + // The ordering here matters from most preferable to the least. 
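+    // Preference order: WHPX first, then HAXM/GHAXM (Intel hosts only), then GVM.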
+ #[cfg(feature = "whpx")] + match hypervisor::whpx::Whpx::is_enabled() { + true => return Ok(HypervisorKind::Whpx), + false => warn!("Whpx not enabled."), + }; + #[cfg(feature = "haxm")] + if get_cpu_manufacturer() == CpuManufacturer::Intel { + // Make sure Haxm device can be opened before selecting it. + match Haxm::new() { + Ok(_) => return Ok(HypervisorKind::Ghaxm), + Err(e) => warn!("Cannot initialize HAXM: {}", e), + }; + } + #[cfg(feature = "gvm")] + // Make sure Gvm device can be opened before selecting it. + match Gvm::new() { + Ok(_) => return Ok(HypervisorKind::Gvm), + Err(e) => warn!("Cannot initialize GVM: {}", e), + }; + bail!("no hypervisor enabled!"); +} + +fn setup_vm_components(cfg: &Config) -> Result { + let initrd_image = if let Some(initrd_path) = &cfg.initrd_path { + Some( + File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || { + format!("failed to open initrd {}", initrd_path.display()) + })?, + ) + } else { + None + }; + + let vm_image = match cfg.executable_path { + Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel( + File::open(kernel_path).with_exit_context(Exit::OpenKernel, || { + format!("failed to open kernel image {}", kernel_path.display(),) + })?, + ), + Some(Executable::Bios(ref bios_path)) => { + VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || { + format!("failed to open bios {}", bios_path.display()) + })?) + } + _ => panic!("Did not receive a bios or kernel, should be impossible."), + }; + + let swiotlb = if let Some(size) = cfg.swiotlb { + Some( + size.checked_mul(1024 * 1024) + .ok_or_else(|| anyhow!("requested swiotlb size too large"))?, + ) + } else { + match cfg.protected_vm { + ProtectionType::Protected | ProtectionType::ProtectedWithoutFirmware => { + Some(64 * 1024 * 1024) + } + ProtectionType::Unprotected | ProtectionType::UnprotectedWithFirmware => None, + } + }; + + Ok(VmComponents { + memory_size: cfg + .memory + .unwrap_or(256) + .checked_mul(1024 * 1024) + .ok_or_else(|| anyhow!("requested memory size too large"))?, + swiotlb, + vcpu_count: cfg.vcpu_count.unwrap_or(1), + vcpu_affinity: cfg.vcpu_affinity.clone(), + cpu_clusters: cfg.cpu_clusters.clone(), + cpu_capacity: cfg.cpu_capacity.clone(), + no_smt: cfg.no_smt, + hugepages: cfg.hugepages, + vm_image, + android_fstab: cfg + .android_fstab + .as_ref() + .map(|x| { + File::open(x).with_exit_context(Exit::OpenAndroidFstab, || { + format!("failed to open android fstab file {}", x.display()) + }) + }) + .map_or(Ok(None), |v| v.map(Some))?, + pstore: cfg.pstore.clone(), + initrd_image, + extra_kernel_params: cfg.params.clone(), + acpi_sdts: cfg + .acpi_tables + .iter() + .map(|path| { + SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || { + format!("failed to open ACPI file {}", path.display()) + }) + }) + .collect::>>()?, + rt_cpus: cfg.rt_cpus.clone(), + delay_rt: cfg.delay_rt, + protected_vm: cfg.protected_vm, + dmi_path: cfg.dmi_path.clone(), + no_i8042: cfg.no_i8042, + no_rtc: cfg.no_rtc, + host_cpu_topology: cfg.host_cpu_topology, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + force_s2idle: cfg.force_s2idle, + itmt: false, + pvm_fw: None, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pci_low_start: cfg.pci_low_start, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pcie_ecam: cfg.pcie_ecam, + }) +} + +// Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch. 
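+// Keeping the concrete chip type in an enum (rather than boxing it as a Box<dyn IrqChipArch>)
+// avoids a heap allocation; as_mut() below hands out the &mut dyn view on demand.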
+enum WindowsIrqChip { + Userspace(UserspaceIrqChip), + #[cfg(feature = "gvm")] + Gvm(GvmIrqChip), + #[cfg(feature = "whpx")] + WhpxSplit(WhpxSplitIrqChip), +} + +impl WindowsIrqChip { + // Convert our enum to a &mut dyn IrqChipArch + fn as_mut(&mut self) -> &mut dyn IrqChipArch { + match self { + WindowsIrqChip::Userspace(i) => i, + #[cfg(feature = "gvm")] + WindowsIrqChip::Gvm(i) => i, + #[cfg(feature = "whpx")] + WindowsIrqChip::WhpxSplit(i) => i, + } + } +} + +/// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will +/// need access to it when tracing is enabled. +static TSC_OFFSETS: once_cell::sync::Lazy>>> = + once_cell::sync::Lazy::new(|| sync::Mutex::new(Vec::new())); + +/// Save the TSC offset for a particular vcpu. +/// +/// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets +/// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus +/// it can cause clock issues in the guest. +pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) { + let offsets_copy = { + let mut offsets = TSC_OFFSETS.lock(); + // make sure offsets vec is large enough before inserting + let newlen = std::cmp::max(offsets.len(), vcpu_id + 1); + offsets.resize(newlen, None); + offsets[vcpu_id] = Some(offset); + + offsets.clone() + }; + + // do statistics on a clone of the offsets so we don't hold up other vcpus at this point + info!( + "TSC offset standard deviation is: {}", + standard_deviation( + &offsets_copy + .iter() + .filter(|x| x.is_some()) + .map(|x| x.unwrap() as u128) + .collect::>() + ) + ); +} + +/// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS. +#[cfg(feature = "cperfetto")] +pub fn get_vcpu_tsc_offset() -> u64 { + for offset in TSC_OFFSETS.lock().iter() { + if let Some(offset) = offset { + return *offset; + } + } + 0 +} + +/// Callback that is registered with tracing crate, and will be called by the tracing thread when +/// tracing is enabled or disabled. Regardless of whether tracing is being enabled or disabled for +/// a given category or instance, we just emit a clock snapshot that maps the guest TSC to the +/// host TSC. Redundant snapshots should not be a problem for perfetto. +#[cfg(feature = "cperfetto")] +fn set_tsc_clock_snapshot() { + let freq = match devices::tsc_frequency() { + Err(e) => { + error!( + "Could not determine tsc frequency, unable to snapshot tsc offset: {}", + e + ); + return; + } + Ok(freq) => freq, + }; + + // The offset is host-guest tsc value + let offset = get_vcpu_tsc_offset(); + // Safe because _rdtsc takes no arguments; + let host_tsc = unsafe { std::arch::x86_64::_rdtsc() }; + perfetto::snapshot_clock(perfetto::ClockSnapshot::new( + // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't + // support floating point multipliers yet. So for now we set the freq in Hz and rely + // on the merge tool to fix it. + perfetto::Clock::new( + perfetto::BuiltinClock::Tsc as u32, + host_tsc.wrapping_add(offset), + ) + .set_multiplier(freq as u64), + perfetto::Clock::new( + // The host builtin clock ids are all offset from the guest ids by + // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot + // contains both a guest and host clock, we need to offset it before merge. 
+ perfetto::BuiltinClock::Tsc as u32 + tracing::HOST_GUEST_CLOCK_ID_OFFSET, + host_tsc, + ) + .set_multiplier(freq as u64), + )); +} + +/// Launches run_config for the broker, reading configuration from a TubeTransporter. +pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result { + // Safe because we know that raw_transport_tube is valid (passed by inheritance), and that + // the blocking & framing modes are accurate because we create them ourselves in the broker. + let tube_transporter = + unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) }; + + let mut tube_data_list = tube_transporter + .read_tubes() + .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?; + + let bootstrap_tube = tube_data_list + .get_tube(TubeToken::Bootstrap) + .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?; + + let mut cfg: Config = bootstrap_tube + .recv::() + .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?; + + let startup_args: CommonChildStartupArgs = bootstrap_tube + .recv::() + .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?; + let _child_cleanup = common_child_setup(startup_args).exit_context( + Exit::CommonChildSetupError, + "failed to perform common child setup", + )?; + + cfg.broker_shutdown_event = Some( + bootstrap_tube + .recv::() + .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?, + ); + + run_config_inner(cfg) +} + +pub fn run_config(cfg: Config) -> Result { + let _raise_timer_resolution = enable_high_res_timers() + .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?; + run_config_inner(cfg) +} + +fn run_config_inner(cfg: Config) -> Result { + #[cfg(feature = "kiwi")] + { + let use_vulkan = if cfg!(feature = "gpu") { + match &cfg.gpu_parameters { + Some(params) => Some(params.use_vulkan), + None => None, + } + } else { + None + }; + anti_tamper::setup_common_metric_invariants( + &&cfg.product_version, + &cfg.product_channel, + &use_vulkan, + ); + } + + tracing::init(); + #[cfg(feature = "cperfetto")] + tracing::add_per_trace_callback(set_tsc_clock_snapshot); + + let components: VmComponents = setup_vm_components(&cfg)?; + + let guest_mem_layout = Arch::guest_memory_layout(&components).exit_context( + Exit::GuestMemoryLayout, + "failed to create guest memory layout", + )?; + let guest_mem = GuestMemory::new(&guest_mem_layout) + .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")?; + + let default_hypervisor = get_default_hypervisor() + .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?; + #[allow(unused_mut)] + let mut hypervisor = cfg.hypervisor.unwrap_or(default_hypervisor); + + #[cfg(feature = "whpx")] + if hypervisor::whpx::Whpx::is_enabled() { + // If WHPX is enabled, no other hypervisor can be used, so just override it + hypervisor = HypervisorKind::Whpx; + } + + match hypervisor { + #[cfg(feature = "haxm")] + HypervisorKind::Haxm | HypervisorKind::Ghaxm => { + if hypervisor == HypervisorKind::Haxm { + set_use_ghaxm(false); + } + let vm = create_haxm(guest_mem, &cfg.kernel_log_file)?; + let (ioapic_host_tube, ioapic_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + let irq_chip = create_userspace_irq_chip::( + components.vcpu_count, + ioapic_device_tube, + )?; + run_vm::( + cfg, + components, + vm, + WindowsIrqChip::Userspace(irq_chip).as_mut(), + Some(ioapic_host_tube), + ) + } + #[cfg(feature = "whpx")] + HypervisorKind::Whpx => { + let 
apic_emulation_supported = + Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation) + .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?; + + let no_smt = cfg.no_smt; + + // Default to WhpxSplitIrqChip if it's supported because it's more performant + let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported { + IrqChipKind::Split + } else { + IrqChipKind::Userspace + }); + + // Both WHPX irq chips use a userspace IOAPIC + let (ioapic_host_tube, ioapic_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + let vm = create_whpx( + guest_mem, + components.vcpu_count, + no_smt, + apic_emulation_supported && irq_chip == IrqChipKind::Split, + cfg.force_calibrated_tsc_leaf, + )?; + + let mut irq_chip = match irq_chip { + IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"), + IrqChipKind::Split => { + if !apic_emulation_supported { + panic!( + "split irqchip specified but your WHPX version does not support \ + local apic emulation" + ); + } + WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?) + } + IrqChipKind::Userspace => { + WindowsIrqChip::Userspace(create_userspace_irq_chip::( + components.vcpu_count, + ioapic_device_tube, + )?) + } + }; + run_vm::( + cfg, + components, + vm, + irq_chip.as_mut(), + Some(ioapic_host_tube), + ) + } + #[cfg(feature = "gvm")] + HypervisorKind::Gvm => { + let vm = create_gvm(guest_mem)?; + let ioapic_host_tube; + let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) { + IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"), + IrqChipKind::Kernel => { + ioapic_host_tube = None; + WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?) + } + IrqChipKind::Userspace => { + let (host_tube, ioapic_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + ioapic_host_tube = Some(host_tube); + WindowsIrqChip::Userspace(create_userspace_irq_chip::( + components.vcpu_count, + ioapic_device_tube, + )?) + } + }; + run_vm::(cfg, components, vm, irq_chip.as_mut(), ioapic_host_tube) + } + } +} + +fn run_vm( + #[allow(unused_mut)] mut cfg: Config, + #[allow(unused_mut)] mut components: VmComponents, + mut vm: V, + irq_chip: &mut dyn IrqChipArch, + ioapic_host_tube: Option, +) -> Result +where + Vcpu: VcpuArch + 'static, + V: VmArch + 'static, +{ + let vm_memory_size_mb = components.memory_size / (1024 * 1024); + let mut control_tubes = Vec::new(); + let mut irq_control_tubes = Vec::new(); + // Create one control tube per disk. + let mut disk_device_tubes = Vec::new(); + let mut disk_host_tubes = Vec::new(); + let disk_count = cfg.disks.len(); + for _ in 0..disk_count { + let (disk_host_tube, disk_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + disk_host_tubes.push(disk_host_tube); + disk_device_tubes.push(disk_device_tube); + } + let (gpu_host_tube, gpu_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(gpu_host_tube)); + + if let Some(ioapic_host_tube) = ioapic_host_tube { + irq_control_tubes.push(ioapic_host_tube); + } + + // Balloon gets a special socket so balloon requests can be forwarded from the main process. 
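+    // The host half stays with the main process (run_control later uses it to forward
+    // balloon size changes), while the device half is handed to create_devices below for
+    // the virtio-balloon device itself.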
+ let (balloon_host_tube, balloon_device_tube) = if cfg.balloon { + let (balloon_host_tube, balloon_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + (Some(balloon_host_tube), Some(balloon_device_tube)) + } else { + (None, None) + }; + // The balloon device also needs a tube to communicate back to the main process to + // handle remapping memory dynamically. + let dynamic_mapping_device_tube = if cfg.balloon { + let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(dynamic_mapping_host_tube)); + Some(dynamic_mapping_device_tube) + } else { + None + }; + + // PvClock gets a tube for handling suspend/resume requests from the main thread. + let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock { + let (host, device) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + (Some(host), Some(device)) + } else { + (None, None) + }; + + #[cfg(feature = "kiwi")] + { + if cfg.service_pipe_name.is_some() { + let (gpu_main_host_tube, gpu_main_display_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::GpuServiceComm(gpu_main_host_tube)); + let mut gpu_parameters = cfg + .gpu_parameters + .as_mut() + .expect("missing GpuParameters in config"); + gpu_parameters.display_params.gpu_main_display_tube = + Some(Arc::new(Mutex::new(gpu_main_display_tube))); + } + }; + + // Create a ServiceComm tube to pass to the gpu device + #[cfg(feature = "kiwi")] + let gpu_device_service_tube = { + let (gpu_device_service_tube, gpu_device_service_host_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::GpuDeviceServiceComm( + gpu_device_service_host_tube, + )); + gpu_device_service_tube + }; + + let gralloc = + RutabagaGralloc::new().exit_context(Exit::CreateGralloc, "failed to create gralloc")?; + let map_request: Arc>> = Arc::new(Mutex::new(None)); + + let (vm_evt_wrtube, vm_evt_rdtube) = + Tube::directional_pair().context("failed to create vm event tube")?; + let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64); + let mut sys_allocator = SystemAllocator::new( + Arch::get_system_allocator_config(&vm), + pstore_size, + &cfg.mmio_address_ranges, + ) + .context("failed to create system allocator")?; + + let mut ac97_host_tubes = Vec::new(); + let mut ac97_device_tubes = Vec::new(); + for _ in &cfg.ac97_parameters { + let (ac97_host_tube, ac97_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + ac97_host_tubes.push(ac97_host_tube); + ac97_device_tubes.push(ac97_device_tube); + } + + // Allocate the ramoops region first. 
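+    // (The region comes out of the allocator's reserved space, which is presumably why it
+    // is claimed before anything else can allocate from that range; ramoops backs the
+    // guest's persistent pstore crash logs.)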
+ let ramoops_region = match &components.pstore { + Some(pstore) => Some( + arch::pstore::create_memory_region( + &mut vm, + sys_allocator.reserved_region().unwrap(), + &pstore, + ) + .exit_context(Exit::Pstore, "failed to allocate pstore region")?, + ), + None => None, + }; + + let init_balloon_size = components + .memory_size + .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| { + m.checked_mul(1024 * 1024).unwrap_or(u64::MAX) + })) + .context("failed to calculate init balloon size")?; + + let tsc_state = devices::tsc_state().exit_code(Exit::TscCalibrationFailed)?; + let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count); + + if tsc_state.core_grouping.size() > 1 { + // Host TSCs are not in sync, log a metric about it. + warn!( + "Host TSCs are not in sync, applying the following mitigations: {:?}", + tsc_sync_mitigations + ); + log_descriptor( + MetricEventType::TscCoresOutOfSync, + // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask + tsc_state.core_grouping.core_grouping_bitmask() as i64, + ); + } + + let pci_devices = create_devices( + &mut cfg, + vm.get_memory(), + &vm_evt_wrtube, + &mut irq_control_tubes, + gpu_device_tube, + &mut disk_device_tubes, + balloon_device_tube, + pvclock_device_tube, + dynamic_mapping_device_tube, + /* inflate_tube= */ None, + init_balloon_size, + Arc::clone(&map_request), + ac97_host_tubes, + #[cfg(feature = "kiwi")] + gpu_device_service_tube, + tsc_state.frequency, + )?; + + let mut vcpu_ids = Vec::new(); + + let windows = Arch::build_vm::( + components, + &vm_evt_wrtube, + &mut sys_allocator, + &cfg.serial_parameters, + None, + (&cfg.battery_type, None), + vm, + ramoops_region, + pci_devices, + irq_chip, + &mut vcpu_ids, + /*debugcon_jail=*/ None, + ) + .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?; + + let _render_node_host = (); + + let stats = if cfg.exit_stats { + Some(Arc::new(Mutex::new(StatisticsCollector::new()))) + } else { + None + }; + + run_control( + windows, + sys_allocator, + control_tubes, + irq_control_tubes, + vm_evt_rdtube, + vm_evt_wrtube, + cfg.broker_shutdown_event.take(), + balloon_host_tube, + pvclock_host_tube, + Arc::clone(&map_request), + gralloc, + stats, + #[cfg(feature = "kiwi")] + cfg.service_pipe_name, + ac97_device_tubes, + vm_memory_size_mb, + cfg.host_cpu_topology, + tsc_sync_mitigations, + cfg.force_calibrated_tsc_leaf, + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + fn create_config(test_dir: &TempDir) -> Config { + let mut config = Config::default(); + + let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt"); + OpenOptions::new() + .create(true) + .write(true) + .open(&dummy_kernel_path) + .expect("Could not open file!"); + config.executable_path = Some(Executable::Kernel(dummy_kernel_path)); + + config + } + + #[test] + #[should_panic(expected = "Did not receive a bios or kernel")] + fn setup_vm_components_panics_when_no_kernel_provided() { + let mut config = + create_config(&TempDir::new().expect("Could not create temporary directory!")); + config.executable_path = None; + let _ = setup_vm_components(&config); + } + + #[test] + fn setup_vm_components_stores_memory_in_bytes() { + let tempdir = TempDir::new().expect("Could not create temporary directory!"); + let mut config = create_config(&tempdir); + config.memory = Some(1); + let vm_components = setup_vm_components(&config).expect("failed to setup vm components"); + assert_eq!(vm_components.memory_size, 1024 * 1024); + } + + 
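+    // A companion example to the test above (illustrative; it relies on the same MiB
+    // interpretation of config.memory): 2048 MiB must land on exactly 2 GiB of bytes.
+    #[test]
+    fn setup_vm_components_stores_2048_mib_in_bytes() {
+        let tempdir = TempDir::new().expect("Could not create temporary directory!");
+        let mut config = create_config(&tempdir);
+        config.memory = Some(2048);
+        let vm_components = setup_vm_components(&config).expect("failed to setup vm components");
+        assert_eq!(vm_components.memory_size, 2048u64 * 1024 * 1024);
+    }
+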
#[test] + fn setup_vm_components_fails_when_memory_too_large() { + let tempdir = TempDir::new().expect("Could not create temporary directory!"); + let mut config = create_config(&tempdir); + // One mb more than a u64 can hold in bytes + config.memory = Some((u64::MAX / 1024 / 1024) + 1); + setup_vm_components(&config).err().expect("expected error"); + } +} diff --git a/src/sys/windows/irq_wait.rs b/src/sys/windows/irq_wait.rs new file mode 100644 index 0000000000..8fe773c300 --- /dev/null +++ b/src/sys/windows/irq_wait.rs @@ -0,0 +1,364 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! Handles the main wait loop for IRQs. +//! Should be started on a background thread. + +use base::{ + error, info, warn, Event, EventToken, ReadNotifier, Result, Tube, TubeError, WaitContext, + MAXIMUM_WAIT_OBJECTS, +}; +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +use devices::IrqChipAArch64 as IrqChipArch; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use devices::IrqChipX86_64 as IrqChipArch; +use devices::{IrqEdgeEvent, IrqEventIndex, IrqEventSource}; +use metrics::{log_high_frequency_descriptor_event, MetricEventType}; +use resources::SystemAllocator; +use std::collections::HashMap; +use std::sync::Arc; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant}; +use sync::Mutex; +use vm_control::{IrqSetup, VmIrqRequest}; + +pub struct IrqWaitWorker { + exit_evt: Event, + irq_chip: Box, + irq_control_tubes: Vec, + sys_allocator: Arc>, +} + +impl IrqWaitWorker { + pub fn start( + exit_evt: Event, + irq_chip: Box, + irq_control_tubes: Vec, + sys_allocator: Arc>, + ) -> JoinHandle> { + let mut irq_worker = IrqWaitWorker { + exit_evt, + irq_chip, + irq_control_tubes, + sys_allocator, + }; + thread::Builder::new() + .name("irq_wait_loop".into()) + .spawn(move || irq_worker.run()) + .unwrap() + } + + fn run(&mut self) -> Result<()> { + #[derive(EventToken)] + enum Token { + Exit, + VmControl { index: usize }, + DelayedIrqEvent, + } + + let wait_ctx = WaitContext::build_with(&[(&self.exit_evt, Token::Exit)])?; + + let mut max_event_index: usize = 0; + let mut vm_control_added_irq_events: Vec = Vec::new(); + let mut irq_event_sources: HashMap = HashMap::new(); + // TODO(b/190828888): Move irq logging into the irqchip impls. + let irq_frequencies = Arc::new(Mutex::new(vec![0; max_event_index + 1])); + let irq_events = self.irq_chip.irq_event_tokens()?; + let mut children = vec![]; + + let (mut child_wait_ctx, child_join_handle) = IrqWaitWorkerChild::start( + self.exit_evt.try_clone()?, + self.irq_chip.try_box_clone()?, + irq_frequencies.clone(), + )?; + children.push(child_join_handle); + + for (event_index, source, evt) in irq_events { + child_wait_ctx.add(&evt, ChildToken::IrqEvent { event_index })?; + max_event_index = std::cmp::max(max_event_index, event_index); + irq_event_sources.insert(event_index, source); + + vm_control_added_irq_events.push(evt); + } + + irq_frequencies.lock().resize(max_event_index + 1, 0); + + for (index, control_tube) in self.irq_control_tubes.iter().enumerate() { + wait_ctx.add(control_tube.get_read_notifier(), Token::VmControl { index })?; + } + + let mut _delayed_event_token: Option = None; + if let Some(delayed_token) = self.irq_chip.irq_delayed_event_token()? { + wait_ctx.add(&delayed_token, Token::DelayedIrqEvent)?; + // store the token, so that it lasts outside this scope. 
+ // We must store the event as try_clone creates a new event. It won't keep + // the current event valid that is waited on inside wait_ctx. + _delayed_event_token = Some(delayed_token); + } + + let mut intr_stat_sample_time = Instant::now(); + + 'poll: loop { + let events = { + match wait_ctx.wait() { + Ok(v) => v, + Err(e) => { + error!("failed to wait on irq thread: {}", e); + break 'poll; + } + } + }; + + let mut vm_control_indices_to_remove = Vec::new(); + for event in events.iter().filter(|e| e.is_readable) { + match event.token { + Token::Exit => { + info!("irq event loop got exit event"); + break 'poll; + } + Token::VmControl { index } => { + if let Some(tube) = self.irq_control_tubes.get(index) { + match tube.recv::() { + Ok(request) => { + let response = { + let irq_chip = &mut self.irq_chip; + let exit_evt = &self.exit_evt; + // TODO(b/229262201): Refactor the closure into a standalone function to reduce indentation. + request.execute( + |setup| match setup { + IrqSetup::Event( + irq, + ev, + device_id, + queue_id, + device_name, + ) => { + let irqevent = IrqEdgeEvent::from_event( + ev.try_clone() + .expect("Failed to clone irq event."), + ); + let source = IrqEventSource { + device_id: device_id.try_into()?, + queue_id, + device_name, + }; + let event_index = irq_chip + .register_edge_irq_event( + irq, + &irqevent, + source.clone(), + )?; + if let Some(event_index) = event_index { + max_event_index = std::cmp::max( + event_index, + irq as usize, + ); + irq_frequencies + .lock() + .resize(max_event_index + 1, 0); + irq_event_sources + .insert(event_index, source); + // Make new thread if needed, including buffer space for any + // events we didn't explicitly add (exit/reset/etc) + if irq_event_sources.len() + % (MAXIMUM_WAIT_OBJECTS - 3) + == 0 + { + // The child wait thread has reached max capacity, we + // need to add another. 
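+                                                            // Concretely: MAXIMUM_WAIT_OBJECTS is the
+                                                            // Win32 wait limit of 64 (assuming base
+                                                            // re-exports it unchanged), so each child
+                                                            // waits on at most 61 irq events plus the
+                                                            // reserved slots; registration number 62
+                                                            // lands here and spawns a fresh child.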
+                                                            let (new_wait_ctx, child_join_handle) =
+                                                                IrqWaitWorkerChild::start(
+                                                                    exit_evt.try_clone()?,
+                                                                    irq_chip.try_box_clone()?,
+                                                                    irq_frequencies.clone(),
+                                                                )?;
+                                                            child_wait_ctx = new_wait_ctx;
+                                                            children.push(child_join_handle);
+                                                        }
+                                                        let irqevent =
+                                                            irqevent.get_trigger().try_clone()?;
+                                                        match child_wait_ctx.add(
+                                                            &irqevent,
+                                                            ChildToken::IrqEvent { event_index },
+                                                        ) {
+                                                            Err(e) => {
+                                                                warn!("failed to add IrqEvent to synchronization context: {}", e);
+                                                                Err(e)
+                                                            },
+                                                            Ok(_) => {
+                                                                vm_control_added_irq_events
+                                                                    .push(irqevent);
+                                                                Ok(())
+                                                            }
+                                                        }
+                                                    } else {
+                                                        Ok(())
+                                                    }
+                                                }
+                                                IrqSetup::Route(route) => irq_chip.route_irq(route),
+                                                IrqSetup::UnRegister(irq, ev) => irq_chip
+                                                    .unregister_edge_irq_event(
+                                                        irq,
+                                                        &IrqEdgeEvent::from_event(ev.try_clone()?),
+                                                    ),
+                                            },
+                                            &mut self.sys_allocator.lock(),
+                                        )
+                                    };
+                                    if let Err(e) = tube.send(&response) {
+                                        error!("failed to send VmIrqResponse: {}", e);
+                                    }
+                                }
+                                Err(e) => {
+                                    if let TubeError::Disconnected = e {
+                                        vm_control_indices_to_remove.push(index);
+                                    } else {
+                                        error!("failed to recv VmIrqRequest: {}", e);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    Token::DelayedIrqEvent => {
+                        if let Err(e) = self.irq_chip.process_delayed_irq_events() {
+                            warn!("can't deliver delayed irqs: {}", e);
+                        }
+                    }
+                }
+            }
+
+            let now = Instant::now();
+            let intr_stat_duration = now.duration_since(intr_stat_sample_time);
+
+            // include interrupt stats every 10 seconds
+            if intr_stat_duration > Duration::from_secs(10) {
+                let mut event_indices: Vec<(&usize, &IrqEventSource)> =
+                    irq_event_sources.iter().collect();
+                // sort the devices by irq_frequency
+                let mut locked_irq_frequencies = irq_frequencies.lock();
+                event_indices
+                    .sort_by_key(|(idx, _)| std::cmp::Reverse(locked_irq_frequencies[**idx]));
+                let rates: Vec<String> = event_indices
+                    .iter()
+                    .filter(|(idx, _)| locked_irq_frequencies[**idx] > 0)
+                    .map(|(idx, source)| {
+                        let rate = locked_irq_frequencies[**idx] / intr_stat_duration.as_secs();
+                        // As the descriptor, use a 64bit int containing two 32bit ids.
+                        // low bits: queue_id, high bits: device_id
+                        let descriptor_bytes: [u8; 8] = {
+                            let mut bytes: [u8; 8] = [0; 8];
+                            for (i, byte) in
+                                (source.queue_id as u32).to_le_bytes().iter().enumerate()
+                            {
+                                bytes[i] = *byte
+                            }
+                            let device_id: u32 = source.device_id.into();
+                            for (i, byte) in device_id.to_le_bytes().iter().enumerate() {
+                                bytes[i + 4] = *byte
+                            }
+                            bytes
+                        };
+                        log_high_frequency_descriptor_event(
+                            MetricEventType::Interrupts,
+                            i64::from_le_bytes(descriptor_bytes),
+                            rate as i64,
+                        );
+                        format!("{}({})->{}/s", source.device_name, source.queue_id, rate,)
+                    })
+                    .collect();
+
+                info!("crosvm-interrupt-rates: {}", rates.join(", "));
+
+                // reset sample time and counters
+                intr_stat_sample_time = now;
+                *locked_irq_frequencies = vec![0; max_event_index + 1];
+            }
+
+            vm_control_indices_to_remove.dedup();
+            for index in vm_control_indices_to_remove {
+                self.irq_control_tubes.swap_remove(index);
+            }
+        }
+
+        // Ensure all children have ended by firing off the exit event again to make sure the loop
+        // is exited, and joining to ensure none are hanging.
+        let _ = self.exit_evt.write(1);
+        for child in children {
+            match child.join() {
+                Ok(Err(e)) => warn!("IRQ worker child ended in error: {}", e),
+                Err(e) => warn!("IRQ worker child panicked with error: {:?}", e),
+                _ => {}
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(EventToken)]
+enum ChildToken {
+    Exit,
+    IrqEvent { event_index: IrqEventIndex },
+}
+/// An arbitrarily expandable worker for waiting on irq events.
+/// This worker is responsible for handling the irq events, whereas
+/// the parent worker's job is just to handle the irq control tube requests.
+struct IrqWaitWorkerChild {
+    wait_ctx: Arc<WaitContext<ChildToken>>,
+    exit_evt: Event,
+    irq_chip: Box<dyn IrqChipArch>,
+    irq_frequencies: Arc<Mutex<Vec<u64>>>,
+}
+
+impl IrqWaitWorkerChild {
+    fn start(
+        exit_evt: Event,
+        irq_chip: Box<dyn IrqChipArch>,
+        irq_frequencies: Arc<Mutex<Vec<u64>>>,
+    ) -> Result<(Arc<WaitContext<ChildToken>>, JoinHandle<Result<()>>)> {
+        let child_wait_ctx = Arc::new(WaitContext::new()?);
+        let mut child = IrqWaitWorkerChild {
+            wait_ctx: child_wait_ctx.clone(),
+            exit_evt,
+            irq_chip,
+            irq_frequencies,
+        };
+        let join_handle = thread::Builder::new()
+            .name("irq_child_wait_loop".into())
+            .spawn(move || child.run())?;
+
+        Ok((child_wait_ctx, join_handle))
+    }
+
+    fn run(&mut self) -> Result<()> {
+        self.wait_ctx.add(&self.exit_evt, ChildToken::Exit)?;
+        'poll: loop {
+            let events = {
+                match self.wait_ctx.wait() {
+                    Ok(v) => v,
+                    Err(e) => {
+                        error!("failed to wait on irq child thread: {}", e);
+                        break 'poll;
+                    }
+                }
+            };
+
+            for event in events.iter().filter(|e| e.is_readable) {
+                match event.token {
+                    ChildToken::Exit => {
+                        info!("irq child event loop got exit event");
+                        break 'poll;
+                    }
+                    ChildToken::IrqEvent { event_index } => {
+                        self.irq_frequencies.lock()[event_index] += 1;
+                        if let Err(e) = self.irq_chip.service_irq_event(event_index) {
+                            error!("failed to signal irq {}: {}", event_index, e);
+                        }
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/src/sys/windows/main.rs b/src/sys/windows/main.rs
new file mode 100644
index 0000000000..9fba29c304
--- /dev/null
+++ b/src/sys/windows/main.rs
@@ -0,0 +1,247 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use anyhow::{anyhow, Result};
+use argh::FromArgs;
+use base::{
+    info,
+    syslog::{self, LogConfig},
+    FromRawDescriptor, RawDescriptor,
+};
+use broker_ipc::{common_child_setup, CommonChildStartupArgs};
+use metrics::{
+    self,
+    event_details_proto::{EmulatorDllDetails, RecordDetails},
+    MetricEventType,
+};
+#[cfg(all(feature = "slirp"))]
+use net_util::slirp::sys::windows::SlirpStartupConfig;
+use tube_transporter::{TubeToken, TubeTransporterReader};
+use win_util::{DllNotificationData, DllWatcher};
+
+use std::collections::HashSet;
+use std::ffi::OsString;
+use std::fs::OpenOptions;
+
+use crate::{
+    crosvm::{
+        argument::{self, Argument},
+        cmdline::RunCommand,
+        sys::cmdline::{Commands, DevicesSubcommand},
+        sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow},
+    },
+    metrics::run_metrics,
+    CommandStatus, Config,
+};
+
+#[cfg(all(feature = "slirp"))]
+pub(crate) fn run_slirp(args: Vec<String>) -> Result<()> {
+    let arguments = &[Argument::value(
+        "bootstrap",
+        "TRANSPORT_TUBE_RD",
+        "TubeTransporter descriptor used to bootstrap the Slirp process.",
+    )];
+
+    let raw_transport_tube = set_bootstrap_arguments(args, arguments)
+        .exit_context(Exit::InvalidSubCommandArgs, "error in setting slirp args")?;
+
+    // Safe because we know that raw_transport_tube is valid (passed by inheritance),
+    // and that the blocking & framing modes are accurate because we create them ourselves
+    // in the broker.
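+    // The bootstrap sequence mirrors run_config_for_broker: wrap the inherited descriptor,
+    // read the tube list, then pull this process's startup state off the bootstrap tube
+    // before doing any real work.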
+ let tube_transporter = + unsafe { TubeTransporterReader::from_raw_descriptor(raw_transport_tube.unwrap()) }; + + let mut tube_data_list = tube_transporter + .read_tubes() + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + let bootstrap_tube = tube_data_list.get_tube(TubeToken::Bootstrap).unwrap(); + + let startup_args: CommonChildStartupArgs = + bootstrap_tube.recv::().unwrap(); + let _child_cleanup = common_child_setup(startup_args).exit_context( + Exit::CommonChildSetupError, + "failed to perform common child setup", + )?; + + let slirp_config = bootstrap_tube.recv::().unwrap(); + + if let Some(mut target) = sandbox::TargetServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")? + { + target.lower_token(); + } + + net_util::Slirp::run_slirp_process( + slirp_config.slirp_pipe, + slirp_config.shutdown_event, + #[cfg(feature = "slirp-ring-capture")] + slirp_config.slirp_capture_file, + ); + Ok(()) +} + +pub fn run_broker_impl(cfg: Config) -> Result<()> { + tracing::init(); + Ok(crate::crosvm::sys::windows::broker::run(cfg)?) +} + +pub fn initialize_sandbox() -> Result<()> { + if sandbox::is_sandbox_target() { + // Get the TargetServices pointer so that it gets initialized. + let _ = sandbox::TargetServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + Ok(()) +} + +#[cfg(feature = "kiwi")] +pub fn sandbox_lower_token() -> Result<()> { + if let Some(mut target) = sandbox::TargetServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")? + { + target.lower_token(); + } + Ok(()) +} + +fn report_dll_loaded(dll_name: String) { + let mut dll_load_details = EmulatorDllDetails::new(); + dll_load_details.set_dll_base_name(dll_name); + let mut details = RecordDetails::new(); + details.set_emulator_dll_details(dll_load_details); + metrics::log_event_with_details(MetricEventType::DllLoaded, &details); +} + +pub fn get_library_watcher( +) -> std::io::Result> { + let mut dlls: HashSet = HashSet::new(); + DllWatcher::new( + move |data| { + info!("DLL loaded: {:?}", data.base_dll_name); + if !dlls.insert(data.base_dll_name.clone()) && metrics::is_initialized() { + report_dll_loaded(data.base_dll_name.to_string_lossy().into_owned()); + } + }, + |data| info!("DLL unloaded: {:?}", data.base_dll_name), + ) +} + +pub(crate) fn start_device(command: DevicesSubcommand) -> Result<()> { + Err(anyhow!("unknown device name: {:?}", command)) +} + +pub(crate) fn run_vm_for_broker(args: Vec) -> Result<()> { + // This is a noop on unix. + initialize_sandbox()?; + let arguments = &[Argument::value( + "bootstrap", + "TRANSPORT_TUBE_RD", + "TubeTransporter descriptor used to bootstrap the main process.", + )]; + + let raw_transport_tube = set_bootstrap_arguments(args, arguments).exit_context( + Exit::InvalidSubCommandArgs, + "error in setting crosvm broker args", + )?; + let exit_state = crate::sys::windows::run_config_for_broker(raw_transport_tube.unwrap()); + crate::to_command_status(exit_state).map(|_| ()) +} + +pub(crate) fn set_bootstrap_arguments( + args: Vec, + arguments: &[Argument], +) -> std::result::Result, argument::Error> { + let mut raw_transport_tube = None; + crate::crosvm::argument::set_arguments(args.iter(), &arguments[..], |name, value| { + if name == "bootstrap" { + raw_transport_tube = Some(value.unwrap().parse::().or(Err( + argument::Error::InvalidValue { + value: value.unwrap().to_string(), + expected: String::from("a raw descriptor integer"), + }, + ))? 
as RawDescriptor); + } + Ok(()) + }) + .expect("Failed to set bootstrap arguments"); + Ok(raw_transport_tube) +} + +pub(crate) fn cleanup() { + // We've already cleaned everything up by waiting for all the vcpu threads on windows. + // TODO: b/142733266. When we sandbox each device, have a way to terminate the other sandboxed processes. +} + +fn run_broker(cmd: RunCommand) -> Result<()> { + match TryInto::::try_into(cmd) { + Ok(cfg) => run_broker_impl(cfg), + Err(e) => Err(anyhow!("{}", e)), + } +} + +pub(crate) fn run_command(cmd: Commands) -> anyhow::Result<()> { + match cmd { + Commands::RunMetrics(cmd) => run_metrics(cmd.args), + Commands::RunMP(cmd) => { + let mut x: Vec<&str> = vec![]; + for s in cmd.args.iter() { + if s == "backend=win_audio" { + x.push(&s.as_str()); + continue; + } + match s.split_once('=') { + Some((k, v)) => { + x.push(&k); + x.push(&v); + } + None => x.push(s.as_str()), + } + } + let cmd = RunCommand::from_args(&["run-mp"], &x); + match cmd { + Ok(cmd) => run_broker(cmd), + Err(e) => Err(anyhow!("Failed to create config: {:?}", e)), + } + } + Commands::RunMain(cmd) => run_vm_for_broker(cmd.args), + #[cfg(feature = "slirp")] + Commands::RunSlirp(cmd) => run_slirp(cmd.args), + } +} + +pub(crate) fn init_log(log_config: LogConfig, cfg: &Config) -> Result<()> +where + F: Fn(&mut base::syslog::fmt::Formatter, &log::Record<'_>) -> std::io::Result<()> + Sync + Send, +{ + if let Err(e) = syslog::init_with(LogConfig { + proc_name: if let Some(ref tag) = cfg.syslog_tag { + tag.to_string() + } else { + String::from("crosvm") + }, + pipe: if let Some(log_file_path) = &cfg.log_file { + let file = OpenOptions::new() + .create(true) + .append(true) + .open(log_file_path) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", log_file_path) + })?; + Some(Box::new(file)) + } else { + None + }, + stderr: if cfg.log_file.is_some() { false } else { true }, + ..log_config + }) { + eprintln!("failed to initialize syslog: {}", e); + return Err(anyhow!("failed to initialize syslog: {}", e)); + } + Ok(()) +} + +pub(crate) fn error_to_exit_code(res: &std::result::Result) -> i32 { + res.to_exit_code().unwrap_or(Exit::UnknownError.into()) +} diff --git a/src/sys/windows/metrics.rs b/src/sys/windows/metrics.rs new file mode 100644 index 0000000000..3a29a3bbd6 --- /dev/null +++ b/src/sys/windows/metrics.rs @@ -0,0 +1,93 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +cfg_if::cfg_if! 
{
+    if #[cfg(feature = "kiwi")] {
+        extern crate metrics as metrics_crate;
+        use anyhow::Context;
+        use broker_ipc::{common_child_setup, CommonChildStartupArgs};
+        use base::Tube;
+        use std::thread;
+        use metrics_crate::MetricsController;
+        use crate::crosvm::sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow};
+        use crate::sys::windows::main::set_bootstrap_arguments;
+        use tube_transporter::{TubeToken, TubeTransporterReader};
+        use base::FromRawDescriptor;
+    }
+}
+
+#[cfg(feature = "kiwi")]
+use crate::crosvm::argument::Argument;
+use anyhow::Result;
+pub(crate) use metrics::{
+    get_destructor, log_descriptor, merge_session_invariants, set_auth_token, set_package_name,
+    MetricEventType,
+};
+
+pub(crate) fn run_metrics(#[allow(unused_variables)] args: Vec<String>) -> Result<()> {
+    #[cfg(not(feature = "kiwi"))]
+    return Ok(());
+
+    #[cfg(feature = "kiwi")]
+    {
+        let arguments = &[Argument::value(
+            "bootstrap",
+            "TRANSPORT_TUBE_RD",
+            "TubeTransporter descriptor used to bootstrap the metrics process.",
+        )];
+
+        let raw_transport_tube = set_bootstrap_arguments(args, arguments).exit_context(
+            Exit::InvalidSubCommandArgs,
+            "error in setting crosvm metrics controller args",
+        )?;
+
+        // Safe because we know that raw_transport_tube is valid (passed by inheritance), and
+        // that the blocking & framing modes are accurate because we create them ourselves in
+        // the broker.
+        let tube_transporter =
+            unsafe { TubeTransporterReader::from_raw_descriptor(raw_transport_tube.unwrap()) };
+
+        let mut tube_data_list = tube_transporter
+            .read_tubes()
+            .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?;
+
+        let bootstrap_tube = tube_data_list.get_tube(TubeToken::Bootstrap).unwrap();
+
+        let startup_args: CommonChildStartupArgs =
+            bootstrap_tube.recv::<CommonChildStartupArgs>().unwrap();
+        let _child_cleanup = common_child_setup(startup_args).exit_context(
+            Exit::CommonChildSetupError,
+            "failed to perform common child setup",
+        )?;
+
+        let metrics_tubes = bootstrap_tube.recv::<Vec<Tube>>().unwrap();
+
+        tracing::init();
+        crate::sys::sandbox_lower_token()?;
+
+        let mut metrics_controller = MetricsController::new(metrics_tubes);
+        metrics_controller
+            .run()
+            .exit_context(Exit::MetricsController, "metrics controller failed")
+    }
+}
+
+pub(crate) fn setup_metrics_reporting() -> Result<()> {
+    #[cfg(not(feature = "kiwi"))]
+    return Ok(());
+
+    #[cfg(feature = "kiwi")]
+    {
+        let (metrics_controller_tube, metrics_agent_tube) =
+            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
+        thread::spawn(move || {
+            let mut metrics_controller = MetricsController::new(vec![metrics_controller_tube]);
+            metrics_controller
+                .run()
+                .context("metrics controller failed")
+                .unwrap();
+        });
+        metrics::initialize(metrics_agent_tube);
+        Ok(())
+    }
+}
diff --git a/src/sys/windows/panic_hook.rs b/src/sys/windows/panic_hook.rs
new file mode 100644
index 0000000000..1d4ff5d91f
--- /dev/null
+++ b/src/sys/windows/panic_hook.rs
@@ -0,0 +1,26 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::panic;
+use std::process::abort;
+
+use crate::metrics;
+
+/// The intent of our panic hook is to get panic info and a stacktrace into the syslog, even for
+/// jailed subprocesses. It will always abort on panic to ensure a minidump is generated.
+///
+/// Note that jailed processes will usually have a stacktrace of `<unknown>` because the backtrace
+/// routines attempt to open this binary and are unable to do so in a jail.
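+///
+/// Expected usage (a sketch; the hook should be installed from the process entry
+/// point before any threads are spawned or guest work begins):
+///
+/// ```ignore
+/// panic_hook::set_panic_hook();
+/// ```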
+pub fn set_panic_hook() { + let default_panic = panic::take_hook(); + panic::set_hook(Box::new(move |info| { + // Ensure all in-flight metrics are fully flushed + metrics::get_destructor().cleanup(); + // TODO(b/144724919): should update log_panic_info for this "cleanly exit crosvm" bug + // log_panic_info(default_panic.as_ref(), info); + default_panic(info); + // Abort to trigger the crash reporter so that a minidump is generated. + abort(); + })); +} diff --git a/src/sys/windows/run_vcpu.rs b/src/sys/windows/run_vcpu.rs new file mode 100644 index 0000000000..711d46101e --- /dev/null +++ b/src/sys/windows/run_vcpu.rs @@ -0,0 +1,922 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use anyhow::{anyhow, Result}; +use arch::{self, LinuxArch, RunnableLinuxVm, VcpuAffinity}; +use base::{ + self, error, info, set_audio_thread_priorities, set_cpu_affinity, warn, Event, + Result as BaseResult, SafeMultimediaHandle, SendTube, Timer, Tube, VmEventType, +}; +use std::{ + arch::x86_64::{__cpuid, __cpuid_count}, + fmt::Display, +}; + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +use { + aarch64::AArch64 as Arch, + devices::{IrqChip, IrqChipAArch64 as IrqChipArch}, + hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch}, +}; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use { + devices::IrqChipX86_64 as IrqChipArch, + hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch}, + x86_64::{adjust_cpuid, CpuIdContext, X8664arch as Arch}, +}; + +use crate::bail_exit_code; +use crate::crosvm::sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow}; +use crate::crosvm::sys::windows::stats::{StatisticsCollector, VmExitStatistics}; +use crate::sys::windows::save_vcpu_tsc_offset; +use cros_async::{select2, EventAsync, Executor, SelectResult, TimerAsync}; +use devices::{Bus, TscSyncMitigations, VcpuRunState}; +use futures::pin_mut; +#[cfg(feature = "whpx")] +use hypervisor::whpx::WhpxVcpu; +use hypervisor::{ + HypervisorCap, IoEventAddress, IoOperation, IoParams, VcpuExit, VcpuInitX86_64, VcpuRunHandle, +}; +use std::convert::TryInto; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Barrier}; +use std::thread::JoinHandle; +use std::time::{Duration, Instant}; +use std::{fmt, thread}; +use sync::{Condvar, Mutex}; +use tracing::trace_event; +use vm_control::VmRunMode; +use winapi::shared::winerror::ERROR_RETRY; + +use crate::sys::windows::ExitState; + +const ERROR_RETRY_I32: i32 = ERROR_RETRY as i32; + +#[derive(Default)] +pub struct VcpuRunMode { + mtx: Mutex, + cvar: Condvar, +} + +impl VcpuRunMode { + pub fn set_and_notify(&self, new_mode: VmRunMode) { + *self.mtx.lock() = new_mode; + self.cvar.notify_all(); + } +} + +struct RunnableVcpuInfo { + vcpu: V, + thread_priority_handle: Option, + vcpu_run_handle: VcpuRunHandle, +} + +#[derive(Clone, Debug)] +struct VcpuMonitoringMetadata { + pub start_instant: Instant, + // Milliseconds since the baseline start_instant + pub last_run_time: Arc, + pub last_exit_snapshot: Arc>>, +} + +#[derive(Clone, Debug)] +struct VcpuRunThread { + pub cpu_id: usize, + pub monitoring_metadata: Option, +} + +impl VcpuRunThread { + pub fn new(cpu_id: usize, enable_vcpu_monitoring: bool) -> VcpuRunThread { + VcpuRunThread { + cpu_id, + monitoring_metadata: enable_vcpu_monitoring.then(|| VcpuMonitoringMetadata { + start_instant: Instant::now(), + last_run_time: Arc::new(AtomicU64::new(0)), + last_exit_snapshot: 
Arc::new(Mutex::new(Option::None)), + }), + } + } + + /// Perform WHPX-specific vcpu configurations + #[cfg(feature = "whpx")] + fn whpx_configure_vcpu(vcpu: &mut dyn VcpuArch, irq_chip: &mut dyn IrqChipArch) { + // only apply to actual WhpxVcpu instances + if let Some(whpx_vcpu) = vcpu.downcast_mut::() { + // WhpxVcpu instances need to know the TSC and Lapic frequencies to handle Hyper-V MSR reads + // and writes. + let tsc_freq = devices::tsc_frequency() + .map_err(|e| { + error!( + "Could not determine TSC frequency, WHPX vcpu will not be configured with \ + a TSC Frequency: {e}" + ); + e + }) + .ok(); + whpx_vcpu.set_frequencies(tsc_freq, irq_chip.lapic_frequency()); + } + } + + // Sets up a vcpu and converts it into a runnable vcpu. + fn runnable_vcpu( + cpu_id: usize, + vcpu: Option, + vcpu_init: VcpuInitX86_64, + vm: &impl VmArch, + irq_chip: &mut dyn IrqChipArch, + vcpu_count: usize, + run_rt: bool, + vcpu_affinity: Option>, + no_smt: bool, + has_bios: bool, + host_cpu_topology: bool, + force_calibrated_tsc_leaf: bool, + ) -> Result> + where + V: VcpuArch, + { + let mut vcpu = match vcpu { + Some(v) => v, + None => { + // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from + // the vcpu thread. + match vm + .create_vcpu(cpu_id) + .exit_context(Exit::CreateVcpu, "failed to create vcpu")? + .downcast::() + { + Ok(v) => *v, + Err(_) => panic!("VM created wrong type of VCPU"), + } + } + }; + + irq_chip + .add_vcpu(cpu_id, &vcpu) + .exit_context(Exit::AddIrqChipVcpu, "failed to add vcpu to irq chip")?; + + if let Some(affinity) = vcpu_affinity { + if let Err(e) = set_cpu_affinity(affinity) { + error!("Failed to set CPU affinity: {}", e); + } + } + + Arch::configure_vcpu( + vm, + vm.get_hypervisor(), + irq_chip, + &mut vcpu, + vcpu_init, + cpu_id, + vcpu_count, + has_bios, + no_smt, + host_cpu_topology, + /* enable_pnp_data */ false, + /* itmt */ false, + force_calibrated_tsc_leaf, + ) + .exit_context(Exit::ConfigureVcpu, "failed to configure vcpu")?; + + #[cfg(feature = "whpx")] + Self::whpx_configure_vcpu(&mut vcpu, irq_chip); + + let mut thread_priority_handle = None; + if run_rt { + // Until we are multi process on Windows, we can't use the normal thread priority APIs; + // instead, we use a trick from the audio device which is able to set a thread RT even + // though the process itself is not RT. 
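+            // (An assumption worth stating: the SafeMultimediaHandle returned on success
+            // keeps the elevated priority only while it stays alive, which is why it is
+            // returned out of this function inside RunnableVcpuInfo rather than dropped here.)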
+            thread_priority_handle = match set_audio_thread_priorities() {
+                Ok(hndl) => Some(hndl),
+                Err(e) => {
+                    warn!("Failed to set vcpu thread to real time priority: {}", e);
+                    None
+                }
+            };
+        }
+
+        let vcpu_run_handle = vcpu
+            .take_run_handle(None)
+            .exit_context(Exit::RunnableVcpu, "failed to set thread id for vcpu")?;
+
+        Ok(RunnableVcpuInfo {
+            vcpu,
+            thread_priority_handle,
+            vcpu_run_handle,
+        })
+    }
+
+    pub fn run<V>(
+        &self,
+        vcpu: Option<V>,
+        vcpu_init: VcpuInitX86_64,
+        vcpus: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
+        vm: impl VmArch + 'static,
+        mut irq_chip: Box<dyn IrqChipArch>,
+        vcpu_count: usize,
+        run_rt: bool,
+        vcpu_affinity: Option<Vec<usize>>,
+        delay_rt: bool,
+        no_smt: bool,
+        start_barrier: Arc<Barrier>,
+        vcpu_create_barrier: Arc<Barrier>,
+        has_bios: bool,
+        mut io_bus: devices::Bus,
+        mut mmio_bus: devices::Bus,
+        vm_evt_wrtube: SendTube,
+        requires_pvclock_ctrl: bool,
+        run_mode_arc: Arc<VcpuRunMode>,
+        stats: Option<Arc<Mutex<StatisticsCollector>>>,
+        host_cpu_topology: bool,
+        tsc_offset: Option<u64>,
+        force_calibrated_tsc_leaf: bool,
+    ) -> Result<JoinHandle<Result<()>>>
+    where
+        V: VcpuArch + 'static,
+    {
+        let context = self.clone();
+        thread::Builder::new()
+            .name(format!("crosvm_vcpu{}", self.cpu_id))
+            .spawn(move || {
+                // Having a closure returning ExitState guarantees that we
+                // send a VmEventType on all code paths after the closure
+                // returns.
+                let vcpu_fn = || -> Result<ExitState> {
+                    let runnable_vcpu = Self::runnable_vcpu(
+                        context.cpu_id,
+                        vcpu,
+                        vcpu_init,
+                        &vm,
+                        irq_chip.as_mut(),
+                        vcpu_count,
+                        run_rt && !delay_rt,
+                        vcpu_affinity,
+                        no_smt,
+                        has_bios,
+                        host_cpu_topology,
+                        force_calibrated_tsc_leaf,
+                    );
+
+                    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+                    let cpuid_context = CpuIdContext::new(
+                        context.cpu_id,
+                        vcpu_count,
+                        no_smt,
+                        host_cpu_topology,
+                        Some(irq_chip.as_ref()),
+                        /* enable_pnp_data */ false,
+                        /* itmt */ false,
+                        force_calibrated_tsc_leaf,
+                        vm.get_hypervisor()
+                            .check_capability(HypervisorCap::CalibratedTscLeafRequired),
+                        __cpuid_count,
+                        __cpuid,
+                    );
+
+                    // The vcpu_create_barrier is supplied from the main thread in order for it to
+                    // wait until this thread is done creating its vcpu.
+                    vcpu_create_barrier.wait();
+
+                    // Wait for this barrier before continuing forward.
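+                    // start_barrier was created in run_all_vcpus with vcpu_count + 1 slots;
+                    // the extra slot belongs to the main thread, so every vcpu parks here
+                    // until the whole set (plus main) is ready to start simultaneously.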
+ start_barrier.wait(); + + let RunnableVcpuInfo { + vcpu, + thread_priority_handle: _thread_priority_handle, + vcpu_run_handle, + } = runnable_vcpu?; + + if let Some(offset) = tsc_offset { + vcpu.set_tsc_offset(offset).unwrap_or_else(|e| { + error!( + "Failed to set tsc_offset of {} on vcpu {}: {}", + offset, context.cpu_id, e + ) + }); + } + + // Clone vcpu so it can be used by the main thread to force a vcpu run to exit + vcpus + .lock() + .push(Box::new(vcpu.try_clone().expect("Could not clone vcpu!"))); + + mmio_bus.set_access_id(context.cpu_id); + io_bus.set_access_id(context.cpu_id); + + vcpu_loop( + &context, + vcpu, + vm, + vcpu_run_handle, + irq_chip, + io_bus, + mmio_bus, + requires_pvclock_ctrl, + run_mode_arc, + stats, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + cpuid_context, + ) + }; + + let final_event_data = match vcpu_fn().unwrap_or_else(|e| { + error!("vcpu {} run loop exited with error: {}", context.cpu_id, e); + ExitState::Stop + }) { + ExitState::Stop => VmEventType::Exit, + _ => unreachable!(), + }; + vm_evt_wrtube + .send::(&final_event_data) + .unwrap_or_else(|e| { + error!( + "failed to send final event {:?} on vcpu {}: {}", + final_event_data, context.cpu_id, e + ) + }); + Ok(()) + }) + .exit_context(Exit::SpawnVcpu, "failed to spawn VCPU thread") + } +} + +#[derive(Clone, Debug)] +struct VcpuExitData { + // Represented by duration since baseline start_instant + exit_time: Duration, + exit_result: BaseResult, +} + +impl Display for VcpuExitData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "exit result: {:?}", self.exit_result) + } +} + +struct VcpuStallMonitor { + vcpu_run_threads: Vec, + run_mode: Arc, +} + +impl VcpuStallMonitor { + const HOST_STALL_TIMEOUT: Duration = Duration::from_secs(2); + const VCPU_CHECKUP_INTERVAL: Duration = Duration::from_secs(1); + const STALL_REPORTING_LIMITER: Duration = Duration::from_secs(10); + + pub fn init(run_mode: Arc) -> VcpuStallMonitor { + VcpuStallMonitor { + vcpu_run_threads: vec![], + run_mode, + } + } + + pub fn add_vcpu_thread(&mut self, thread: VcpuRunThread) { + self.vcpu_run_threads.push(thread); + } + + pub fn run(self, exit_event: &Event) -> Result>> { + let cloned_exit_event = exit_event + .try_clone() + .exit_context(Exit::CloneEvent, "failed to clone event")?; + thread::Builder::new() + .name("crosvm_vcpu_stall_monitor".to_string()) + .spawn(move || { + let ex = Executor::new()?; + + let mut timer = TimerAsync::new(Timer::new()?, &ex)?; + let mut reset_timer = true; + + let exit_evt_async = EventAsync::new(cloned_exit_event, &ex)?; + let exit_future = exit_evt_async.next_val(); + pin_mut!(exit_future); + 'main: loop { + if reset_timer { + timer.reset( + Self::VCPU_CHECKUP_INTERVAL, + Some(Self::VCPU_CHECKUP_INTERVAL), + )?; + reset_timer = false; + } + let timer_future = timer.next_val(); + pin_mut!(timer_future); + match ex.run_until(select2(timer_future, exit_future)) { + Ok((timer_result, exit_result)) => { + match exit_result { + SelectResult::Finished(_) => { + info!("vcpu monitor got exit event"); + break 'main; + } + SelectResult::Pending(future) => exit_future = future, + } + + match timer_result { + SelectResult::Finished(Err(e)) => { + error!( + "vcpu monitor aborting due to error awaiting future: {}", + e + ); + break 'main; + } + SelectResult::Finished(_) => self.report_any_stalls(), + _ => (), + } + } + Err(e) => { + error!("vcpu monitor failed to wait on future set: {:?}", e); + break 'main; + } + } + + // Always ensure the vcpus aren't suspended 
before continuing to monitor.
+                    let mut run_mode_lock = self.run_mode.mtx.lock();
+                    loop {
+                        match *run_mode_lock {
+                            VmRunMode::Running => break,
+                            VmRunMode::Suspending | VmRunMode::Breakpoint => {
+                                info!("vcpu monitor pausing until end of suspension");
+                                run_mode_lock = self.run_mode.cvar.wait(run_mode_lock);
+                                reset_timer = true;
+                            }
+                            VmRunMode::Exiting => {
+                                info!("vcpu monitor detected vm exit");
+                                break 'main;
+                            }
+                        }
+                    }
+                }
+
+                Ok(())
+            })
+            .exit_context(
+                Exit::SpawnVcpuMonitor,
+                "failed to spawn VCPU stall monitor thread",
+            )
+    }
+
+    fn report_any_stalls(&self) {
+        // TODO(b/208267651): Add and fire Clearcut events for stalls (and add tests)
+        // TODO(b/208267651): Also test guest stalls (vcpu.run() goes too long without exiting)
+        let now = Instant::now();
+        for vcpu_thread in self.vcpu_run_threads.iter() {
+            let monitoring_metadata = vcpu_thread.monitoring_metadata.as_ref().unwrap();
+            if let Some(ref exit_snapshot) = monitoring_metadata.last_exit_snapshot.lock().clone() {
+                let last_run =
+                    Duration::from_millis(monitoring_metadata.last_run_time.load(Ordering::SeqCst));
+                if last_run < exit_snapshot.exit_time {
+                    // VCPU is between runs
+                    let time_since_exit = now.saturating_duration_since(
+                        monitoring_metadata.start_instant + exit_snapshot.exit_time,
+                    );
+                    if time_since_exit > Self::HOST_STALL_TIMEOUT {
+                        self.report_stall(vcpu_thread.cpu_id, exit_snapshot, time_since_exit);
+                    }
+                }
+            };
+        }
+    }
+
+    fn report_stall(&self, cpu_id: usize, exit_data: &VcpuExitData, stall_time: Duration) {
+        if stall_time > Self::STALL_REPORTING_LIMITER {
+            return;
+        }
+        // Double check the Vm is running. We don't care about stalls during suspension/exit.
+        if *self.run_mode.mtx.lock() == VmRunMode::Running {
+            let duration_string = format!("{:.1}sec", stall_time.as_secs_f32());
+            error!(
+                "Host stall for {} on VCPU {} exit while handling: {}",
+                duration_string, cpu_id, exit_data,
+            );
+        }
+    }
+}
+
+fn setup_vcpu_signal_handler() -> Result<()> {
+    Ok(())
+}
+
+pub fn run_all_vcpus<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
+    vcpus: Vec<Option<Vcpu>>,
+    vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
+    guest_os: &RunnableLinuxVm<V, Vcpu>,
+    exit_evt: &Event,
+    vm_evt_wrtube: &SendTube,
+    pvclock_host_tube: &Option<Tube>,
+    stats: &Option<Arc<Mutex<StatisticsCollector>>>,
+    host_cpu_topology: bool,
+    run_mode_arc: Arc<VcpuRunMode>,
+    tsc_sync_mitigations: TscSyncMitigations,
+    force_calibrated_tsc_leaf: bool,
+) -> Result<Vec<JoinHandle<Result<()>>>> {
+    let mut vcpu_threads = Vec::with_capacity(guest_os.vcpu_count + 1);
+    let start_barrier = Arc::new(Barrier::new(guest_os.vcpu_count + 1));
+    let enable_vcpu_monitoring = anti_tamper::enable_vcpu_monitoring();
+    setup_vcpu_signal_handler()?;
+
+    let mut stall_monitor =
+        enable_vcpu_monitoring.then(|| VcpuStallMonitor::init(run_mode_arc.clone()));
+    for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
+        let vcpu_affinity = match guest_os.vcpu_affinity.clone() {
+            Some(VcpuAffinity::Global(v)) => Some(v),
+            Some(VcpuAffinity::PerVcpu(mut m)) => Some(m.remove(&cpu_id).unwrap_or_default()),
+            None => None,
+        };
+
+        // TSC sync mitigations may set vcpu affinity and set a TSC offset
+        let (vcpu_affinity, tsc_offset): (Option<Vec<usize>>, Option<u64>) =
+            if let Some(mitigation_affinity) = tsc_sync_mitigations.get_vcpu_affinity(cpu_id) {
+                if vcpu_affinity.is_none() {
+                    (
+                        Some(mitigation_affinity),
+                        tsc_sync_mitigations.get_vcpu_tsc_offset(cpu_id),
+                    )
+                } else {
+                    error!(
+                        "Core affinity {:?} specified via the command line conflicts with, and \
+                        overrides, the affinity needed for TSC sync mitigation: {:?}.",
+                        vcpu_affinity, mitigation_affinity
+                    );
+                    (vcpu_affinity, None)
+                }
+            } else {
+                (vcpu_affinity, None)
+            };
+
+        let vcpu_init = &guest_os.vcpu_init[cpu_id];
+        // The vcpu_create_barrier allows the main thread to delay the spawning of additional
+        // vcpu threads until a single spawned vcpu thread has finished creating its vcpu.
+        // We currently use this to allow creation of 1 vcpu at a time for all hypervisors.
+        // There are issues with multiple hypervisors with this approach:
+        // - Windows 11 has a regression which causes a BSOD with creation of multiple vcpus
+        //   in parallel. See http://b/229635845 for more details.
+        // - GHAXM/HAXM cannot create vcpu0 in parallel with other Vcpus.
+        let vcpu_create_barrier = Arc::new(Barrier::new(2));
+        let vcpu_run_thread = VcpuRunThread::new(cpu_id, enable_vcpu_monitoring);
+        let join_handle = vcpu_run_thread.run(
+            vcpu,
+            vcpu_init.clone(),
+            vcpu_boxes.clone(),
+            guest_os
+                .vm
+                .try_clone()
+                .exit_context(Exit::CloneEvent, "failed to clone vm")?,
+            guest_os
+                .irq_chip
+                .try_box_clone()
+                .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
+            guest_os.vcpu_count,
+            guest_os.rt_cpus.contains(&cpu_id),
+            vcpu_affinity,
+            guest_os.delay_rt,
+            guest_os.no_smt,
+            start_barrier.clone(),
+            vcpu_create_barrier.clone(),
+            guest_os.has_bios,
+            (*guest_os.io_bus).clone(),
+            (*guest_os.mmio_bus).clone(),
+            vm_evt_wrtube
+                .try_clone()
+                .exit_context(Exit::CloneTube, "failed to clone tube")?,
+            pvclock_host_tube.is_none(),
+            run_mode_arc.clone(),
+            stats.clone(),
+            host_cpu_topology,
+            tsc_offset,
+            force_calibrated_tsc_leaf,
+        )?;
+        if let Some(ref mut monitor) = stall_monitor {
+            monitor.add_vcpu_thread(vcpu_run_thread);
+        }
+
+        // Wait until the vcpu is created before we start a new vcpu thread
+        vcpu_create_barrier.wait();
+
+        vcpu_threads.push(join_handle);
+    }
+    if let Some(monitor) = stall_monitor {
+        vcpu_threads.push(monitor.run(exit_evt)?);
+    }
+    // Now wait on the start barrier to start all threads at the same time.
+    start_barrier.wait();
+    Ok(vcpu_threads)
+}
+
+fn vcpu_loop<V>(
+    context: &VcpuRunThread,
+    mut vcpu: V,
+    vm: impl VmArch + 'static,
+    vcpu_run_handle: VcpuRunHandle,
+    irq_chip: Box<dyn IrqChipArch>,
+    mut io_bus: Bus,
+    mut mmio_bus: Bus,
+    requires_pvclock_ctrl: bool,
+    run_mode_arc: Arc<VcpuRunMode>,
+    stats: Option<Arc<Mutex<StatisticsCollector>>>,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] cpuid_context: CpuIdContext,
+) -> Result<ExitState>
+where
+    V: VcpuArch + 'static,
+{
+    let mut exit_stats = VmExitStatistics::new();
+    mmio_bus.stats.set_enabled(stats.is_some());
+    io_bus.stats.set_enabled(stats.is_some());
+    exit_stats.set_enabled(stats.is_some());
+
+    let mut save_tsc_offset = true;
+
+    loop {
+        let _trace_event = trace_event!(crosvm, "vcpu loop");
+        let mut check_vm_shutdown = false;
+
+        match irq_chip.wait_until_runnable(&vcpu).with_exit_context(
+            Exit::WaitUntilRunnable,
+            || {
+                format!(
+                    "error waiting for vcpu {} to become runnable",
+                    context.cpu_id
+                )
+            },
+        )?
{ + VcpuRunState::Runnable => {} + VcpuRunState::Interrupted => check_vm_shutdown = true, + } + + if !check_vm_shutdown { + let exit = { + let _trace_event = trace_event!(crosvm, "vcpu::run"); + if let Some(ref monitoring_metadata) = context.monitoring_metadata { + monitoring_metadata.last_run_time.store( + // Safe conversion because millis will always be < u32::MAX + monitoring_metadata + .start_instant + .elapsed() + .as_millis() + .try_into() + .unwrap(), + Ordering::SeqCst, + ); + } + vcpu.run(&vcpu_run_handle) + }; + if let Some(ref monitoring_metadata) = context.monitoring_metadata { + *monitoring_metadata.last_exit_snapshot.lock() = Some(VcpuExitData { + exit_time: monitoring_metadata.start_instant.elapsed(), + exit_result: exit, + }); + } + + // save the tsc offset if we need to + if save_tsc_offset { + if let Ok(offset) = vcpu.get_tsc_offset() { + save_vcpu_tsc_offset(offset, context.cpu_id); + } else { + error!("Unable to determine TSC offset"); + } + save_tsc_offset = false; + } + + let start = exit_stats.start_stat(); + + match exit { + Ok(VcpuExit::Io) => { + let _trace_event = trace_event!(crosvm, "VcpuExit::Io"); + vcpu.handle_io(&mut |IoParams { address, mut size, operation}| { + match operation { + IoOperation::Read => { + let mut data = [0u8; 8]; + if size > data.len() { + error!("unsupported IoIn size of {} bytes", size); + size = data.len(); + } + io_bus.read(address, &mut data[..size]); + Some(data) + } + IoOperation::Write { data } => { + if size > data.len() { + error!("unsupported IoOut size of {} bytes", size); + size = data.len() + } + vm.handle_io_events(IoEventAddress::Pio(address), &data[..size]) + .unwrap_or_else(|e| error!( + "failed to handle ioevent for pio write to {} on vcpu {}: {}", + address, context.cpu_id, e + )); + io_bus.write(address, &data[..size]); + None + } + } + }).unwrap_or_else(|e| error!("failed to handle io: {}", e)); + } + Ok(VcpuExit::Mmio) => { + let _trace_event = trace_event!(crosvm, "VcpuExit::Mmio"); + vcpu.handle_mmio(&mut |IoParams { address, mut size, operation }| { + match operation { + IoOperation::Read => { + let mut data = [0u8; 8]; + if size > data.len() { + error!("unsupported MmioRead size of {} bytes", size); + size = data.len(); + } + { + let data = &mut data[..size]; + if !mmio_bus.read(address, data) { + info!( + "mmio read failed: {:x}; trying memory read..", + address + ); + vm.get_memory() + .read_exact_at_addr( + data, + vm_memory::GuestAddress(address), + ) + .unwrap_or_else(|e| { + error!( + "guest memory read failed at {:x}: {}", + address, e + ) + }); + } + } + Some(data) + } + IoOperation::Write { data } => { + if size > data.len() { + error!("unsupported MmioWrite size of {} bytes", size); + size = data.len() + } + let data = &data[..size]; + vm.handle_io_events(IoEventAddress::Mmio(address), data) + .unwrap_or_else(|e| error!( + "failed to handle ioevent for mmio write to {} on vcpu {}: {}", + address, context.cpu_id, e + )); + if !mmio_bus.write(address, data) { + info!( + "mmio write failed: {:x}; trying memory write..", + address + ); + vm.get_memory() + .write_all_at_addr(data, vm_memory::GuestAddress(address)) + .unwrap_or_else(|e| error!( + "guest memory write failed at {:x}: {}", + address, e + )); + } + None + } + } + }).unwrap_or_else(|e| error!("failed to handle mmio: {}", e)); + } + Ok(VcpuExit::IoapicEoi { vector }) => { + irq_chip.broadcast_eoi(vector).unwrap_or_else(|e| { + error!( + "failed to broadcast eoi {} on vcpu {}: {}", + vector, context.cpu_id, e + ) + }); + } + 
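+                        // (Re: IoapicEoi above. Level-triggered lines stay asserted until the
+                        // device releases them, so the userspace IOAPIC relies on the guest's
+                        // EOI broadcast to know when a still-pending vector must be re-injected.)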
Ok(VcpuExit::IrqWindowOpen) => {} + Ok(VcpuExit::Hlt) => irq_chip.halted(context.cpu_id), + + // VcpuExit::Shutdown is always an error on Windows. HAXM exits with + // Shutdown only for triple faults and other vcpu panics. WHPX never exits + // with Shutdown. Normal reboots and shutdowns, like window close, use + // the vm event tube and VmRunMode::Exiting instead of VcpuExit::Shutdown. + Ok(VcpuExit::Shutdown) => bail_exit_code!(Exit::VcpuShutdown, "vcpu shutdown"), + Ok(VcpuExit::FailEntry { + hardware_entry_failure_reason, + }) => bail_exit_code!( + Exit::VcpuFailEntry, + "vcpu hw run failure: {:#x}", + hardware_entry_failure_reason, + ), + Ok(VcpuExit::SystemEventShutdown) => { + bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventShutdown") + } + Ok(VcpuExit::SystemEventReset) => { + bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventReset") + } + Ok(VcpuExit::SystemEventCrash) => { + bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventCrash") + } + + // When we're shutting down (e.g., emulator window gets closed), GVM vmexits + // with KVM_EXIT_INTR, which vcpu.run maps to VcpuExit::Intr. But KVM_EXIT_INTR + // can happen during normal operation too, when GVM's timer finds requests + // pending from the host. So we set check_vm_shutdown, then below check the + // VmRunMode state to see if we should exit the run loop. + Ok(VcpuExit::Intr) => check_vm_shutdown = true, + Ok(VcpuExit::Canceled) => check_vm_shutdown = true, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Ok(VcpuExit::Cpuid { mut entry }) => { + let _trace_event = trace_event!(crosvm, "VcpuExit::Cpuid"); + // adjust the results based on crosvm logic + adjust_cpuid(&mut entry, &cpuid_context); + + // let the vcpu finish handling the exit + vcpu.handle_cpuid(&entry).unwrap_or_else(|e| { + error!( + "failed to handle setting cpuid results on cpu {}: {}", + context.cpu_id, e + ) + }); + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Ok(VcpuExit::MsrAccess) => {} // MsrAccess handled by hypervisor impl + Ok(r) => { + error!("unexpected vcpu.run return value: {:?}", r); + check_vm_shutdown = true; + } + Err(e) => match e.errno() { + ERROR_RETRY_I32 => {} + _ => { + run_mode_arc.set_and_notify(VmRunMode::Exiting); + Err(e).exit_context(Exit::VcpuRunError, "vcpu run error")?; + } + }, + } + + exit_stats.end_stat(&exit, start); + } + + if check_vm_shutdown { + let mut run_mode_lock = run_mode_arc.mtx.lock(); + loop { + match *run_mode_lock { + VmRunMode::Running => break, + VmRunMode::Suspending => { + // On KVM implementations that use a paravirtualized clock (e.g. + // x86), a flag must be set to indicate to the guest kernel that + // a VCPU was suspended. The guest kernel will use this flag to + // prevent the soft lockup detection from triggering when this + // VCPU resumes, which could happen days later in realtime. + if requires_pvclock_ctrl { + vcpu.pvclock_ctrl().unwrap_or_else(|e| error!( + "failed to signal to hypervisor that vcpu {} is being suspended: {}", + context.cpu_id, e + )); + } + } + VmRunMode::Breakpoint => {} + VmRunMode::Exiting => { + if let Some(stats) = stats { + let mut collector = stats.lock(); + collector.pio_bus_stats.push(io_bus.stats); + collector.mmio_bus_stats.push(mmio_bus.stats); + collector.vm_exit_stats.push(exit_stats); + } + return Ok(ExitState::Stop); + } + } + // Give ownership of our exclusive lock to the condition variable that + // will block. When the condition variable is notified, `wait` will + // unblock and return a new exclusive lock. 
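+                    // The waking side is VcpuRunMode::set_and_notify above: it stores the
+                    // new mode under this same mutex and calls notify_all, at which point
+                    // wait() returns with the lock re-acquired and the loop re-checks the mode.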
+ run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock); + } + } + + irq_chip.inject_interrupts(&vcpu).unwrap_or_else(|e| { + error!( + "failed to inject interrupts for vcpu {}: {}", + context.cpu_id, e + ) + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct SetupData { + pub monitor: VcpuStallMonitor, + pub exit_evt: Event, + } + + fn set_up_stall_monitor(vcpu_count: usize) -> Result { + let run_mode = Arc::new(VcpuRunMode::default()); + let mut monitor = VcpuStallMonitor::init(run_mode.clone()); + + for id in 0..vcpu_count { + let new_vcpu = VcpuRunThread::new(id, true /* enable_vcpu_monitoring */); + monitor.add_vcpu_thread(new_vcpu); + } + + Ok(SetupData { + monitor, + exit_evt: Event::new().expect("Failed to create event"), + }) + } + + #[test] + fn stall_monitor_closes_on_exit_evt() -> Result<()> { + let SetupData { monitor, exit_evt } = set_up_stall_monitor(1)?; + + let _ = exit_evt.write(1)?; + let _ = monitor + .run(&exit_evt)? + .join() + .unwrap_or_else(|e| panic!("Thread join failed: {:?}", e)); + Ok(()) + } +}
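+
+// A minimal sketch of the VcpuRunMode handshake the vcpu loop relies on (an
+// illustrative test, assuming VmRunMode's default is Running): a thread parked
+// on the condvar is released as soon as set_and_notify publishes a new mode.
+#[cfg(test)]
+mod run_mode_example {
+    use super::*;
+
+    #[test]
+    fn set_and_notify_wakes_a_parked_waiter() {
+        let run_mode = Arc::new(VcpuRunMode::default());
+
+        let waiter = {
+            let run_mode = run_mode.clone();
+            thread::spawn(move || {
+                let mut mode = run_mode.mtx.lock();
+                // Re-check the predicate under the lock; if the notification already
+                // happened, we never block at all.
+                while !matches!(*mode, VmRunMode::Exiting) {
+                    mode = run_mode.cvar.wait(mode);
+                }
+            })
+        };
+
+        run_mode.set_and_notify(VmRunMode::Exiting);
+        waiter.join().expect("waiter thread panicked");
+    }
+}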