diff --git a/Cargo.toml b/Cargo.toml
index 1ca956270a..a886090bcd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -43,6 +43,7 @@ lto = true
 members = [
     "aarch64",
     "acpi_tables",
+    "anti_tamper",
     "arch",
     "argh_helpers",
     "base",
diff --git a/anti_tamper/Cargo.toml b/anti_tamper/Cargo.toml
new file mode 100644
index 0000000000..0e85191a20
--- /dev/null
+++ b/anti_tamper/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "anti_tamper"
+version = "0.1.0"
+authors = ["The Chromium OS Authors"]
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[features]
+anti-tamper = []
+proto-tube-hack = []
+
+[dependencies]
+base = { path = "../base" }
diff --git a/anti_tamper/src/lib.rs b/anti_tamper/src/lib.rs
new file mode 100644
index 0000000000..e091eb634b
--- /dev/null
+++ b/anti_tamper/src/lib.rs
@@ -0,0 +1,7 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+pub mod noop;
+
+pub use noop::*;
diff --git a/anti_tamper/src/noop.rs b/anti_tamper/src/noop.rs
new file mode 100644
index 0000000000..bee5b73ae3
--- /dev/null
+++ b/anti_tamper/src/noop.rs
@@ -0,0 +1,35 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::thread;
+
+use base::Tube;
+
+pub fn setup_common_metric_invariants(
+    _product_version: &Option<String>,
+    _product_channel: &Option<String>,
+    _use_vulkan: &Option<bool>,
+) {
+}
+
+#[cfg(feature = "proto-tube-hack")]
+pub fn forward_security_challenge(_recv: &Tube, _sender: &Tube) {}
+
+#[cfg(feature = "proto-tube-hack")]
+pub fn forward_security_signal(_recv: &Tube, _sender: &Tube) {}
+
+pub fn enable_vcpu_monitoring() -> bool {
+    false
+}
+
+// This is a hard limit as it is used to set the Tube buffer size, and will
+// deadlock if exceeded (b/223807352).
+pub const MAX_CHALLENGE_SIZE: usize = 1;
+
+pub fn spawn_dedicated_anti_tamper_thread(
+    #[cfg(not(feature = "proto-tube-hack"))] _tube_to_main_thread: Tube,
+    #[cfg(feature = "proto-tube-hack")] _tube_to_main_thread: base::Tube,
+) -> thread::JoinHandle<()> {
+    thread::spawn(move || {})
+}
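// Aside: a minimal usage sketch for the no-op API above (illustrative, not
// part of the patch). It assumes base::Tube::pair() as used elsewhere in
// this change; the binding names are hypothetical.
fn anti_tamper_usage_sketch() {
    let (_main_end, thread_end) = base::Tube::pair().expect("failed to create tube pair");
    let handle = anti_tamper::spawn_dedicated_anti_tamper_thread(thread_end);
    // With the noop implementation, the spawned thread returns immediately.
    handle.join().expect("anti-tamper thread panicked");
}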
diff --git a/src/crosvm/cmdline.rs b/src/crosvm/cmdline.rs
index 7321d21b2d..eb3ca571d3 100644
--- a/src/crosvm/cmdline.rs
+++ b/src/crosvm/cmdline.rs
@@ -17,6 +17,9 @@ cfg_if::cfg_if! {
             parse_coiommu_params, VfioCommand, parse_vfio, parse_vfio_platform,
         };
         use super::config::SharedDir;
+    } else if #[cfg(windows)] {
+        use crate::crosvm::sys::config::IrqChipKind;
+    }
 }
@@ -514,6 +517,10 @@ pub struct RunCommand {
     /// num_input_streams=INT - Set number of input PCM streams
     /// per device.
     pub cras_snds: Vec<CrasSndParameters>,
+    #[cfg(feature = "crash-report")]
+    #[argh(option, long = "crash-pipe-name", arg_name = "\\\\.\\pipe\\PIPE_NAME")]
+    /// the crash handler ipc pipe name.
+    pub crash_pipe_name: Option<String>,
     #[argh(switch)]
     /// don't set VCPUs real-time until make-rt command is run
     pub delay_rt: bool,
@@ -584,6 +591,10 @@ pub struct RunCommand {
     #[argh(positional, arg_name = "KERNEL")]
     /// bzImage of kernel to run
     pub executable_path: Option<PathBuf>,
+    #[cfg(windows)]
+    #[argh(switch, long = "exit-stats")]
+    /// gather and display statistics on Vm Exits and Bus Reads/Writes.
+    pub exit_stats: bool,
     #[argh(
         option,
         long = "file-backed-mapping",
@@ -674,6 +685,10 @@ pub struct RunCommand {
     #[argh(switch)]
     /// use mirror cpu topology of Host for Guest VM, also copy some cpu feature to Guest VM
    pub host_cpu_topology: bool,
+    #[cfg(windows)]
+    #[argh(option, long = "host-guid", arg_name = "PATH")]
+    /// string representation of the host guid in registry format, for namespacing vsock connections.
+    pub host_guid: Option<String>,
     #[cfg(unix)]
     #[argh(option, arg_name = "IP")]
     /// IP address to assign to host tap interface
@@ -687,9 +702,17 @@ pub struct RunCommand {
     #[argh(option, short = 'i', long = "initrd", arg_name = "PATH")]
     /// initial ramdisk to load
     pub initrd_path: Option<PathBuf>,
+    #[cfg(windows)]
+    #[argh(option, long = "irqchip", arg_name = "kernel|split|userspace")]
+    /// type of interrupt controller emulation. \"split\" is only available for x86 KVM.
+    pub irq_chip: Option<IrqChipKind>,
     #[argh(switch)]
     /// allow to enable ITMT scheduling feature in VM. The success of enabling depends on HWP and ACPI CPPC support on hardware
     pub itmt: bool,
+    #[cfg(windows)]
+    #[argh(option, long = "kernel-log-file", arg_name = "PATH")]
+    /// forward hypervisor kernel driver logs for this VM to a file.
+    pub kernel_log_file: Option<String>,
     #[cfg(unix)]
     #[argh(option, long = "kvm-device", arg_name = "PATH")]
     /// path to the KVM device. (default /dev/kvm)
@@ -698,6 +721,14 @@ pub struct RunCommand {
     #[argh(switch)]
     /// disable host swap on guest VM pages.
     pub lock_guest_memory: bool,
+    #[cfg(windows)]
+    #[argh(option, long = "log-file", arg_name = "PATH")]
+    /// redirect logs to the supplied log file at PATH rather than stderr. For multi-process mode, use --logs-directory instead
+    pub log_file: Option<String>,
+    #[cfg(windows)]
+    #[argh(option, long = "logs-directory", arg_name = "PATH")]
+    /// path to the logs directory used for crosvm processes. Logs will be sent to stderr if unset, and stderr/stdout will be uncaptured
+    pub logs_directory: Option<String>,
     #[cfg(unix)]
     #[argh(option, arg_name = "MAC", long = "mac")]
     /// MAC address for VM
@@ -802,6 +833,26 @@ pub struct RunCommand {
     #[argh(switch)]
     /// grant this Guest VM certain privileges to manage Host resources, such as power management
     pub privileged_vm: bool,
+    #[cfg(feature = "process-invariants")]
+    #[argh(option, long = "process-invariants-handle", arg_name = "PATH")]
+    /// shared read-only memory address for a serialized EmulatorProcessInvariants proto
+    pub process_invariants_data_handle: Option<u64>,
+    #[cfg(feature = "process-invariants")]
+    #[argh(option, long = "process-invariants-size", arg_name = "PATH")]
+    /// size of the serialized EmulatorProcessInvariants proto pointed at by process-invariants-handle
+    pub process_invariants_data_size: Option<usize>,
+    #[cfg(windows)]
+    #[argh(option, long = "product-channel")]
+    /// product channel
+    pub product_channel: Option<String>,
+    #[cfg(feature = "crash-report")]
+    #[argh(option, long = "product-name")]
+    /// the product name for file paths.
+    pub product_name: Option<String>,
+    #[cfg(windows)]
+    #[argh(option, long = "product-version")]
+    /// product version
+    pub product_version: Option<String>,
     #[argh(switch)]
     /// prevent host access to guest memory
     pub protected_vm: bool,
@@ -812,6 +863,10 @@ pub struct RunCommand {
     /// path to pstore buffer backend file followed by size
     /// [--pstore <path=PATH,size=SIZE>]
     pub pstore: Option<Pstore>,
+    #[cfg(windows)]
+    #[argh(switch)]
+    /// enable virtio-pvclock.
+    pub pvclock: bool,
     // Must be `Some` iff `protected_vm == ProtectionType::UnprotectedWithFirmware`.
     #[argh(option, long = "unprotected-vm-with-firmware", arg_name = "PATH")]
     /// (EXPERIMENTAL/FOR DEBUGGING) Use VM firmware, but allow host access to guest memory
@@ -913,6 +968,10 @@ pub struct RunCommand {
     /// Can only be given once. Will default to first serial
     /// port if not provided.
     pub serial_parameters: Vec<SerialParameters>,
+    #[cfg(feature = "kiwi")]
+    #[argh(option, long = "service-pipe-name", arg_name = "PIPE_NAME")]
+    /// the service ipc pipe name. (Prefix \\\\.\\pipe\\ not needed.)
+    pub service_pipe_name: Option<String>,
     #[cfg(unix)]
     #[argh(
         option,
@@ -966,6 +1025,10 @@ pub struct RunCommand {
     /// when the underlying file system supports POSIX ACLs.
     /// The default value for this option is "true".
     pub shared_dirs: Vec<SharedDir>,
+    #[cfg(feature = "slirp-ring-capture")]
+    #[argh(option, long = "slirp-capture-file", arg_name = "PATH")]
+    /// redirects slirp network packets to the supplied log file rather than the current directory as `slirp_capture_packets.pcap`
+    pub slirp_capture_file: Option<String>,
     #[argh(option, short = 's', long = "socket", arg_name = "PATH")]
     /// path to put the control socket. If PATH is a directory, a name will be generated
     pub socket_path: Option<PathBuf>,
@@ -1391,6 +1454,36 @@ impl TryFrom<RunCommand> for super::config::Config {
             cfg.pmem_devices.push(pmem);
         }
 
+        #[cfg(windows)]
+        {
+            #[cfg(feature = "crash-report")]
+            {
+                cfg.product_name = cmd.product_name;
+
+                cfg.crash_pipe_name = cmd.crash_pipe_name;
+            }
+            cfg.exit_stats = cmd.exit_stats;
+            cfg.host_guid = cmd.host_guid;
+            cfg.irq_chip = cmd.irq_chip;
+            cfg.kernel_log_file = cmd.kernel_log_file;
+            cfg.log_file = cmd.log_file;
+            cfg.logs_directory = cmd.logs_directory;
+            #[cfg(feature = "process-invariants")]
+            {
+                cfg.process_invariants_data_handle = cmd.process_invariants_data_handle;
+
+                cfg.process_invariants_data_size = cmd.process_invariants_data_size;
+            }
+            cfg.pvclock = cmd.pvclock;
+            cfg.service_pipe_name = cmd.service_pipe_name;
+            #[cfg(feature = "slirp-ring-capture")]
+            {
+                cfg.slirp_capture_file = cmd.slirp_capture_file;
+            }
+            cfg.syslog_tag = cmd.syslog_tag;
+            cfg.product_channel = cmd.product_channel;
+            cfg.product_version = cmd.product_version;
+        }
         cfg.pstore = cmd.pstore;
 
         #[cfg(unix)]
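// Aside: a self-contained sketch of the cfg-gated argh pattern used above
// (illustrative; `Demo` and its flag are hypothetical, not crosvm code).
use argh::FromArgs;

/// demo command
#[derive(FromArgs)]
struct Demo {
    #[argh(switch)]
    /// gather and display statistics
    exit_stats: bool,
}

fn argh_sketch() {
    // argh derives the `--exit-stats` long flag from the field name.
    let demo = Demo::from_args(&["demo"], &["--exit-stats"]).expect("parse failed");
    assert!(demo.exit_stats);
}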
#[argh(option, long = "unprotected-vm-with-firmware", arg_name = "PATH")] /// (EXPERIMENTAL/FOR DEBUGGING) Use VM firmware, but allow host access to guest memory @@ -913,6 +968,10 @@ pub struct RunCommand { /// Can only be given once. Will default to first serial /// port if not provided. pub serial_parameters: Vec, + #[cfg(feature = "kiwi")] + #[argh(option, long = "service-pipe-name", arg_name = "PIPE_NAME")] + /// the service ipc pipe name. (Prefix \\\\.\\pipe\\ not needed. + pub service_pipe_name: Option, #[cfg(unix)] #[argh( option, @@ -966,6 +1025,10 @@ pub struct RunCommand { /// when the underlying file system supports POSIX ACLs. /// The default value for this option is "true". pub shared_dirs: Vec, + #[cfg(feature = "slirp-ring-capture")] + #[argh(option, long = "slirp-capture-file", arg_name = "PATH")] + /// Redirects slirp network packets to the supplied log file rather than the current directory as `slirp_capture_packets.pcap` + pub slirp_capture_file: Option, #[argh(option, short = 's', long = "socket", arg_name = "PATH")] /// path to put the control socket. If PATH is a directory, a name will be generated pub socket_path: Option, @@ -1391,6 +1454,36 @@ impl TryFrom for super::config::Config { cfg.pmem_devices.push(pmem); } + #[cfg(windows)] + { + #[cfg(feature = "crash-report")] + { + cfg.product_name = cmd.product_name; + + cfg.crash_pipe_name = cmd.crash_pipe_name; + } + cfg.exit_stats = cmd.exit_stats; + cfg.host_guid = cmd.host_guid; + cfg.irq_chip = cmd.irq_chip; + cfg.kernel_log_file = cmd.kernel_log_file; + cfg.log_file = cmd.log_file; + cfg.logs_directory = cmd.logs_directory; + #[cfg(feature = "process-invariants")] + { + cfg.process_invariants_data_handle = cmd.process_invariants_data_handle; + + cfg.process_invariants_data_size = cmd.process_invariants_data_size; + } + cfg.pvclock = cmd.pvclock; + cfg.service_pipe_name = cmd.service_pipe_name; + #[cfg(feature = "slirp-ring-capture")] + { + cfg.slirp_capture_file = cmd.slirp_capture_file; + } + cfg.syslog_tag = cmd.syslog_tag; + cfg.product_channel = cmd.product_channel; + cfg.product_version = cmd.product_version; + } cfg.pstore = cmd.pstore; #[cfg(unix)] diff --git a/src/crosvm/config.rs b/src/crosvm/config.rs index cce2cfc8cb..40d7c8e863 100644 --- a/src/crosvm/config.rs +++ b/src/crosvm/config.rs @@ -35,7 +35,7 @@ use x86_64::{set_enable_pnp_data_msr_config, set_itmt_msr_config}; #[cfg(feature = "audio")] use devices::{Ac97Backend, Ac97Parameters}; -use super::{argument::parse_hex_or_decimal, check_opt_path}; +use super::{argument::parse_hex_or_decimal, check_opt_path, sys::HypervisorKind}; cfg_if::cfg_if! { if #[cfg(unix)] { @@ -49,6 +49,10 @@ cfg_if::cfg_if! 
diff --git a/src/crosvm/sys.rs b/src/crosvm/sys.rs
index 1a855388da..2ade3d9c99 100644
--- a/src/crosvm/sys.rs
+++ b/src/crosvm/sys.rs
@@ -2,11 +2,18 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
+#[cfg(unix)]
+pub(crate) mod unix;
+
+#[cfg(windows)]
+pub(crate) mod windows;
+
 cfg_if::cfg_if! {
     if #[cfg(unix)] {
-        pub(crate) mod unix;
         use unix as platform;
         pub(crate) use unix::*;
+    } else if #[cfg(windows)] {
+        use windows as platform;
     } else {
         compile_error!("Unsupported platform");
     }
@@ -15,5 +22,8 @@ cfg_if::cfg_if! {
 
 pub(crate) use platform::cmdline;
 pub(crate) use platform::config;
+#[cfg(feature = "crash-report")]
+pub(crate) use platform::broker::setup_emulator_crash_reporting;
 #[cfg(feature = "gpu")]
 pub(crate) use platform::config::validate_gpu_config;
+pub(crate) use platform::config::HypervisorKind;
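// Aside: the dispatch pattern above, distilled (illustrative): modules are
// declared unconditionally under cfg, then aliased to a shared name so
// downstream paths stay identical on every OS.
#[cfg(unix)]
mod unix_demo { pub fn name() -> &'static str { "unix" } }
#[cfg(windows)]
mod windows_demo { pub fn name() -> &'static str { "windows" } }

cfg_if::cfg_if! {
    if #[cfg(unix)] {
        use unix_demo as platform_demo;
    } else if #[cfg(windows)] {
        use windows_demo as platform_demo;
    } else {
        compile_error!("Unsupported platform");
    }
}

pub use platform_demo::name;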
diff --git a/src/crosvm/sys/unix/config.rs b/src/crosvm/sys/unix/config.rs
index b5952da394..19e8a3ef71 100644
--- a/src/crosvm/sys/unix/config.rs
+++ b/src/crosvm/sys/unix/config.rs
@@ -16,13 +16,27 @@ use crate::crosvm::config::{invalid_value_err, Config};
 #[cfg(feature = "gpu")]
 use crate::crosvm::{argument, argument::parse_hex_or_decimal};
 
+#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
+pub enum HypervisorKind {
+    Kvm,
+}
+
+impl FromStr for HypervisorKind {
+    type Err = &'static str;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "kvm" => Ok(HypervisorKind::Kvm),
+            _ => Err("invalid hypervisor backend"),
+        }
+    }
+}
+
 #[cfg(all(feature = "gpu", feature = "virgl_renderer_next"))]
 pub fn parse_gpu_render_server_options(
     s: &str,
 ) -> Result<GpuRenderServerParameters, String> {
-    use std::{path::PathBuf, str::FromStr};
-
-    use crate::crosvm::{config::invalid_value_err, sys::GpuRenderServerParameters};
+    use crate::crosvm::sys::GpuRenderServerParameters;
 
     let mut path: Option<PathBuf> = None;
     let mut cache_path = None;
@@ -65,8 +79,6 @@ pub fn parse_ac97_options(
     key: &str,
     #[allow(unused_variables)] value: &str,
 ) -> Result<(), String> {
-    use std::{path::PathBuf, str::FromStr};
-
     match key {
         #[cfg(feature = "audio_cras")]
         "client_type" => {
@@ -260,8 +272,6 @@ pub fn parse_gpu_options(s: &str) -> Result<GpuParameters, String> {
     use devices::virtio::GpuMode;
     use rutabaga_gfx::RutabagaWsi;
 
-    use crate::crosvm::sys::config::is_gpu_backend_deprecated;
-
     #[cfg(feature = "gfxstream")]
     let mut vulkan_specified = false;
     #[cfg(feature = "gfxstream")]
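// Aside: usage sketch for the FromStr impl above (illustrative): parsing is
// case-insensitive, and unknown backends return the static error string.
fn hypervisor_kind_sketch() {
    use std::str::FromStr;
    assert_eq!(HypervisorKind::from_str("KVM"), Ok(HypervisorKind::Kvm));
    assert!(HypervisorKind::from_str("hyperv").is_err());
}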
diff --git a/src/crosvm/sys/windows.rs b/src/crosvm/sys/windows.rs
new file mode 100644
index 0000000000..cabc012745
--- /dev/null
+++ b/src/crosvm/sys/windows.rs
@@ -0,0 +1,10 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+pub mod cmdline;
+pub mod config;
+
+pub(crate) mod broker;
+pub(crate) mod exit;
+pub(crate) mod stats;
diff --git a/src/crosvm/sys/windows/broker.rs b/src/crosvm/sys/windows/broker.rs
new file mode 100644
index 0000000000..b8038f37c1
--- /dev/null
+++ b/src/crosvm/sys/windows/broker.rs
@@ -0,0 +1,1747 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+//! Contains the multi-process broker for crosvm. This is a work in progress, and some example
+//! structs here are dead code.
+#![allow(dead_code)]
+use crate::crosvm::sys::windows::exit::{
+    to_process_type_error, Exit, ExitCode, ExitCodeWrapper, ExitContext, ExitContextAnyhow,
+};
+#[cfg(feature = "crash-report")]
+use crash_report::CrashReportAttributes;
+
+use crate::{bail_exit_code, crosvm::sys::config::ProcessType, ensure_exit_code, Config};
+use anyhow::{anyhow, Context, Result};
+use base::named_pipes::{self, BlockingMode, FramingMode};
+use base::{
+    error, info, syslog, warn, AsRawDescriptor, Descriptor, DuplicateHandleRequest,
+    DuplicateHandleResponse, Event, EventToken, RawDescriptor, ReadNotifier, SafeDescriptor, Timer,
+    Tube, WaitContext,
+};
+
+use base::enable_high_res_timers;
+use broker_ipc::CommonChildStartupArgs;
+#[cfg(feature = "process-invariants")]
+use broker_ipc::{init_broker_process_invariants, EmulatorProcessInvariants};
+use devices::virtio::vhost::user::device::NetBackendConfig;
+#[cfg(feature = "gpu")]
+use gpu_display::EventDevice;
+use metrics::event_details_proto::{EmulatorChildProcessExitDetails, RecordDetails};
+use metrics::{self, MetricEventType};
+use net_util::slirp::sys::windows::{SlirpStartupConfig, SLIRP_BUFFER_SIZE};
+use std::boxed::Box;
+use std::collections::HashMap;
+use std::env::current_exe;
+use std::ffi::OsStr;
+use std::fmt::{self, Debug, Display, Formatter};
+use std::fs::OpenOptions;
+use std::os::windows::io::{AsRawHandle, RawHandle};
+use std::path::{Path, PathBuf};
+use std::process::{self, Command};
+use std::time::Duration;
+use tube_transporter::{TubeToken, TubeTransferData, TubeTransporter};
+use win_util::get_exit_code_process;
+use winapi::shared::winerror::ERROR_ACCESS_DENIED;
+use winapi::um::processthreadsapi::TerminateProcess;
+#[cfg(feature = "crash-report")]
+use {base::generate_uuid, crash_report::product_type};
+
+const KILL_CHILD_EXIT_CODE: u32 = 1;
+
+/// With the GPU case, only the backend needs the event devices (input device source end), so
+/// we have two structs. This one is sent to the backend, and the other goes to the main process.
+#[cfg(feature = "gpu")]
+struct GpuDeviceBackend {
+    bootstrap_tube: Tube,
+    vhost_user: Tube,
+    event_devices: Vec<EventDevice>,
+}
+
+/// Main process end for a GPU device.
+#[cfg(feature = "gpu")]
+struct GpuDeviceVMM {
+    bootstrap_tube: Tube,
+    vhost_user: Tube,
+}
+
+/// Example of the function that would be in linux.rs.
+#[cfg(feature = "gpu")]
+fn platform_create_gpus(_cfg: Config) -> Vec<(GpuDeviceBackend, GpuDeviceVMM)> {
+    unimplemented!()
+}
+
+/// This struct represents a configured "disk" device as returned by the platform's API. There will
+/// be two instances of it for each disk device, with the Tubes connected appropriately. The broker
+/// will send one of these to the main process, and the other to the vhost user disk backend.
+struct DiskDeviceEnd {
+    bootstrap_tube: Tube,
+    vhost_user: Tube,
+}
+
+/// Example of the function that would be in linux.rs.
+fn platform_create_disks(_cfg: Config) -> Vec<(DiskDeviceEnd, DiskDeviceEnd)> {
+    unimplemented!()
+}
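// Aside: an illustrative sketch (not part of the patch) of how the two ends
// of a disk device could be wired: each side receives one half of each Tube
// pair, one destined for the main process and one for the backend.
fn disk_device_pair_sketch() -> Result<(DiskDeviceEnd, DiskDeviceEnd)> {
    let (bootstrap_main, bootstrap_backend) = Tube::pair().context("failed to create tube pair")?;
    let (vhost_main, vhost_backend) = Tube::pair().context("failed to create tube pair")?;
    Ok((
        DiskDeviceEnd { bootstrap_tube: bootstrap_main, vhost_user: vhost_main },
        DiskDeviceEnd { bootstrap_tube: bootstrap_backend, vhost_user: vhost_backend },
    ))
}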
+/// Time to wait after a process failure for the remaining processes to exit. When exceeded, all
+/// remaining processes, except metrics, will be terminated.
+const EXIT_TIMEOUT: Duration = Duration::from_secs(3);
+/// Time to wait for the metrics process to flush and upload all logs.
+const METRICS_TIMEOUT: Duration = Duration::from_secs(3);
+
+/// Maps a process type to its sandbox policy configuration.
+fn process_policy(process_type: ProcessType, cfg: &Config) -> sandbox::policy::Policy {
+    #[allow(unused_mut)]
+    let mut policy = match process_type {
+        ProcessType::Block => sandbox::policy::BLOCK,
+        ProcessType::Main => main_process_policy(cfg),
+        ProcessType::Metrics => sandbox::policy::METRICS,
+        ProcessType::Net => sandbox::policy::NET,
+        ProcessType::Slirp => sandbox::policy::SLIRP,
+    };
+    #[cfg(feature = "asan")]
+    adjust_asan_policy(&mut policy);
+    #[cfg(feature = "cperfetto")]
+    adjust_perfetto_policy(&mut policy);
+    policy
+}
+
+/// Dynamically appends rules to the main process's policy.
+fn main_process_policy(cfg: &Config) -> sandbox::policy::Policy {
+    let mut policy = sandbox::policy::MAIN;
+    if let Some(host_guid) = &cfg.host_guid {
+        let rule = sandbox::policy::Rule {
+            subsystem: sandbox::SubSystem::SUBSYS_FILES,
+            semantics: sandbox::Semantics::FILES_ALLOW_ANY,
+            pattern: format!("\\??\\pipe\\{}\\vsock-*", host_guid),
+        };
+        policy.exceptions.push(rule);
+    }
+    let blocked_dlls = vec![
+        "NahimicOSD.dll",
+        "XSplitGameSource64.dll",
+        "TwitchNativeOverlay64.dll",
+        "GridWndHook.dll",
+    ];
+    for dll in blocked_dlls.iter() {
+        policy.dll_blocklist.push(dll.to_string());
+    }
+    policy
+}
+
+/// Adjust a policy to allow ASAN builds to write output files.
+fn adjust_asan_policy(policy: &mut sandbox::policy::Policy) {
+    if (policy.initial_token_level as i32) < (sandbox::TokenLevel::USER_RESTRICTED_NON_ADMIN as i32)
+    {
+        policy.initial_token_level = sandbox::TokenLevel::USER_RESTRICTED_NON_ADMIN;
+    }
+    if (policy.integrity_level as i32) > (sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM as i32) {
+        policy.integrity_level = sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM;
+    }
+}
+
+/// Adjust a policy to allow perfetto tracing to open shared memory and use WASAPI.
+fn adjust_perfetto_policy(policy: &mut sandbox::policy::Policy) {
+    if (policy.initial_token_level as i32)
+        < (sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS as i32)
+    {
+        policy.initial_token_level = sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS;
+    }
+
+    if (policy.lockdown_token_level as i32)
+        < (sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS as i32)
+    {
+        policy.lockdown_token_level = sandbox::TokenLevel::USER_RESTRICTED_SAME_ACCESS;
+    }
+
+    if (policy.integrity_level as i32) > (sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM as i32) {
+        policy.integrity_level = sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM;
+    }
+
+    if (policy.delayed_integrity_level as i32)
+        > (sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM as i32)
+    {
+        policy.delayed_integrity_level = sandbox::IntegrityLevel::INTEGRITY_LEVEL_MEDIUM;
+    }
+}
+/// Wrapper that terminates a child process (if running) when dropped.
+struct ChildCleanup {
+    process_type: ProcessType,
+    child: Box<dyn Child>,
+    dh_tube: Option<Tube>,
+}
+
+#[derive(Debug)]
+struct UnsandboxedChild(process::Child);
+#[derive(Debug)]
+struct SandboxedChild(SafeDescriptor);
+
+impl AsRawDescriptor for UnsandboxedChild {
+    fn as_raw_descriptor(&self) -> RawDescriptor {
+        self.0.as_raw_handle()
+    }
+}
+
+impl AsRawDescriptor for SandboxedChild {
+    fn as_raw_descriptor(&self) -> RawDescriptor {
+        self.0.as_raw_descriptor()
+    }
+}
+
+impl Display for ChildCleanup {
+    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
+        write!(f, "{:?} {:?}", self.process_type, self.child)
+    }
+}
+
+trait Child: std::fmt::Debug + AsRawDescriptor {
+    fn wait(&mut self) -> std::io::Result<Option<ExitCode>>;
+    fn try_wait(&mut self) -> std::io::Result<Option<ExitCode>>;
+    fn kill(&mut self) -> std::io::Result<()>;
+    // Necessary to upcast dyn Child to dyn AsRawDescriptor
+    fn as_descriptor(&self) -> &dyn AsRawDescriptor;
+}
+
+impl Child for UnsandboxedChild {
+    fn wait(&mut self) -> std::io::Result<Option<ExitCode>> {
+        Ok(self.0.wait()?.code())
+    }
+
+    fn try_wait(&mut self) -> std::io::Result<Option<ExitCode>> {
+        if let Some(status) = self.0.try_wait()? {
+            Ok(status.code())
+        } else {
+            Ok(None)
+        }
+    }
+
+    fn kill(&mut self) -> std::io::Result<()> {
+        self.0.kill()
+    }
+
+    fn as_descriptor(&self) -> &dyn AsRawDescriptor {
+        self
+    }
+}
+
+impl Child for SandboxedChild {
+    fn wait(&mut self) -> std::io::Result<Option<ExitCode>> {
+        let wait_ctx = WaitContext::<u32>::new()?;
+        wait_ctx.add(&self.0, 0)?;
+        let _events = wait_ctx.wait()?;
+        self.try_wait()
+    }
+
+    fn try_wait(&mut self) -> std::io::Result<Option<ExitCode>> {
+        get_exit_code_process(self.0.as_raw_descriptor()).map(|code| code.map(|c| c as i32))
+    }
+
+    fn kill(&mut self) -> std::io::Result<()> {
+        if unsafe { TerminateProcess(self.0.as_raw_descriptor(), KILL_CHILD_EXIT_CODE) == 0 } {
+            Err(std::io::Error::last_os_error())
+        } else {
+            Ok(())
+        }
+    }
+
+    fn as_descriptor(&self) -> &dyn AsRawDescriptor {
+        self
+    }
+}
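// Aside: why `as_descriptor` exists, in a self-contained sketch (illustrative,
// not crosvm code). On stable Rust at the time, `&dyn Child` could not be
// coerced directly to its supertrait object `&dyn AsRawDescriptor`, so the
// trait carries an explicit upcast method.
trait Describable {
    fn id(&self) -> u32;
}
trait Managed: Describable {
    // Concrete impls return `self`, where the coercion is allowed.
    fn as_describable(&self) -> &dyn Describable;
}
struct Proc(u32);
impl Describable for Proc {
    fn id(&self) -> u32 {
        self.0
    }
}
impl Managed for Proc {
    fn as_describable(&self) -> &dyn Describable {
        self
    }
}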
+impl Drop for ChildCleanup {
+    fn drop(&mut self) {
+        let kill_process = match self.child.try_wait() {
+            Ok(None) => true,
+            Ok(_) => false,
+            Err(_) => true,
+        };
+        if kill_process {
+            if let Err(e) = self.child.kill() {
+                const ACCESS_DENIED: Option<i32> = Some(ERROR_ACCESS_DENIED as i32);
+                if !matches!(e.raw_os_error(), ACCESS_DENIED) {
+                    error!("Failed to clean up child process {}: {}", self, e);
+                }
+            }
+
+            // Sending a kill signal does NOT imply the process has exited. Wait for it to exit.
+            let wait_res = self.child.wait();
+            if let Ok(Some(code)) = wait_res.as_ref() {
+                warn!(
+                    "child process {} killed, exited {}",
+                    self,
+                    ExitCodeWrapper(*code)
+                );
+            } else {
+                error!(
+                    "failed to wait for child process {} that was terminated: {:?}",
+                    self, wait_res
+                );
+            }
+        } else {
+            info!("child process {} already terminated", self);
+        }
+
+        // Log child exit code regardless of whether we killed it or it exited
+        // on its own.
+        {
+            // Don't even attempt to log the metrics process; it doesn't exist to log
+            // itself.
+            if self.process_type != ProcessType::Metrics {
+                let exit_code = self.child.wait();
+                if exit_code.is_ok() && exit_code.as_ref().unwrap().is_some() {
+                    let mut details = RecordDetails::new();
+                    let mut exit_details = EmulatorChildProcessExitDetails::new();
+                    exit_details.set_exit_code(exit_code.unwrap().unwrap() as u32);
+                    exit_details.set_process_type(self.process_type.into());
+                    details.set_emulator_child_process_exit_details(exit_details);
+                    metrics::log_event_with_details(MetricEventType::ChildProcessExit, &details);
+                } else {
+                    error!(
+                        "Failed to log exit code for process: {:?}, couldn't get exit code",
+                        self.process_type
+                    );
+                }
+            }
+        }
+    }
+}
+#[cfg(feature = "crash-report")] +pub fn setup_emulator_crash_reporting(cfg: &Config) -> Result { + crash_report::setup_crash_reporting(create_crash_report_attrs( + cfg, + crash_report::product_type::EMULATOR, + )) + .exit_context( + Exit::CrashReportingInit, + "failed to initialize crash reporting", + ) +} + +/// Starts the broker, which in turn spawns the main process & vhost user devices. +/// General data flow for device & main process spawning: +/// Each platform (e.g. linux.rs) will provide create_inputs/gpus/nets. +/// +/// Those functions will return a list of pairs of structs (containing the pipes and other +/// process specific configuration) for the VMM & backend sides of the device. These structs +/// should be minimal, and not duplicate information that is otherwise available in the Config +/// struct. There MAY be two different types per device, one for the VMM side, and another for +/// the backend. +/// +/// The broker will send all the VMM structs to the main process, and the other structs +/// to the vhost user backends. Every process will get a copy of the Config struct. +/// +/// Finally, the broker will wait on the child processes to exit, and handle errors. +/// +/// Refrain from using platform specific code within this function. It will eventually be cross +/// platform. +fn run_internal(mut cfg: Config) -> Result<()> { + if sandbox::is_sandbox_broker() { + // Get the BrokerServices pointer so that it gets initialized. + sandbox::BrokerServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + // Note that parsing args causes syslog's log file to be set to the log file for the "main" + // process. We don't want broker logs going there, so we fetch our own log file and set it here. + let mut log_cfg = syslog::LogConfig::default(); + if let Some(log_path) = get_log_path(&cfg, "broker_syslog.log") { + log_cfg.pipe = Some(Box::new( + OpenOptions::new() + .append(true) + .create(true) + .open(log_path.as_path()) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", log_path.display()) + })?, + )); + log_cfg.stderr = false; + } else { + log_cfg.stderr = true; + } + syslog::init_with(log_cfg)?; + + #[cfg(feature = "process-invariants")] + let process_invariants = init_broker_process_invariants( + &cfg.process_invariants_data_handle, + &cfg.process_invariants_data_size, + ) + .exit_context( + Exit::ProcessInvariantsInit, + "failed to initialize process invariants", + )?; + + #[cfg(feature = "crash-report")] + init_broker_crash_reporting(&mut cfg)?; + + let _raise_timer_resolution = enable_high_res_timers() + .exit_context(Exit::EnableHighResTimer, "failed to enable high res timers")?; + + // Note: in case of an error / scope exit, any children still in this map will be automatically + // closed. + let mut children: HashMap = HashMap::new(); + + let mut exit_events = Vec::new(); + let mut wait_ctx: WaitContext = WaitContext::new() + .exit_context(Exit::CreateWaitContext, "failed to create event context")?; + + // Hook ^C / SIGTERM so we can handle it gracefully. 
+#[cfg(feature = "crash-report")]
+pub fn create_crash_report_attrs(cfg: &Config, product_type: &str) -> CrashReportAttributes {
+    crash_report::CrashReportAttributes {
+        product_type: product_type.to_owned(),
+        pipe_name: cfg.crash_pipe_name.clone(),
+        report_uuid: cfg.crash_report_uuid.clone(),
+        product_name: cfg.product_name.clone(),
+        product_version: cfg.product_version.clone(),
+    }
+}
+
+/// Setup crash reporting for a process. Each process MUST provide a unique `product_type` to avoid
+/// making crash reports incomprehensible.
+#[cfg(feature = "crash-report")]
+pub fn setup_emulator_crash_reporting(cfg: &Config) -> Result<String> {
+    crash_report::setup_crash_reporting(create_crash_report_attrs(
+        cfg,
+        crash_report::product_type::EMULATOR,
+    ))
+    .exit_context(
+        Exit::CrashReportingInit,
+        "failed to initialize crash reporting",
+    )
+}
+
+/// Starts the broker, which in turn spawns the main process & vhost user devices.
+/// General data flow for device & main process spawning:
+/// Each platform (e.g. linux.rs) will provide create_inputs/gpus/nets.
+///
+/// Those functions will return a list of pairs of structs (containing the pipes and other
+/// process specific configuration) for the VMM & backend sides of the device. These structs
+/// should be minimal, and not duplicate information that is otherwise available in the Config
+/// struct. There MAY be two different types per device, one for the VMM side, and another for
+/// the backend.
+///
+/// The broker will send all the VMM structs to the main process, and the other structs
+/// to the vhost user backends. Every process will get a copy of the Config struct.
+///
+/// Finally, the broker will wait on the child processes to exit, and handle errors.
+///
+/// Refrain from using platform specific code within this function. It will eventually be cross
+/// platform.
+fn run_internal(mut cfg: Config) -> Result<()> {
+    if sandbox::is_sandbox_broker() {
+        // Get the BrokerServices pointer so that it gets initialized.
+        sandbox::BrokerServices::get()
+            .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    }
+    // Note that parsing args causes syslog's log file to be set to the log file for the "main"
+    // process. We don't want broker logs going there, so we fetch our own log file and set it here.
+    let mut log_cfg = syslog::LogConfig::default();
+    if let Some(log_path) = get_log_path(&cfg, "broker_syslog.log") {
+        log_cfg.pipe = Some(Box::new(
+            OpenOptions::new()
+                .append(true)
+                .create(true)
+                .open(log_path.as_path())
+                .with_exit_context(Exit::LogFile, || {
+                    format!("failed to open log file {}", log_path.display())
+                })?,
+        ));
+        log_cfg.stderr = false;
+    } else {
+        log_cfg.stderr = true;
+    }
+    syslog::init_with(log_cfg)?;
+
+    #[cfg(feature = "process-invariants")]
+    let process_invariants = init_broker_process_invariants(
+        &cfg.process_invariants_data_handle,
+        &cfg.process_invariants_data_size,
+    )
+    .exit_context(
+        Exit::ProcessInvariantsInit,
+        "failed to initialize process invariants",
+    )?;
+
+    #[cfg(feature = "crash-report")]
+    init_broker_crash_reporting(&mut cfg)?;
+
+    let _raise_timer_resolution = enable_high_res_timers()
+        .exit_context(Exit::EnableHighResTimer, "failed to enable high res timers")?;
+
+    // Note: in case of an error / scope exit, any children still in this map will be automatically
+    // closed.
+    let mut children: HashMap<u32, ChildCleanup> = HashMap::new();
+
+    let mut exit_events = Vec::new();
+    let mut wait_ctx: WaitContext<Token> = WaitContext::new()
+        .exit_context(Exit::CreateWaitContext, "failed to create event context")?;
+
+    // Hook ^C / SIGTERM so we can handle it gracefully.
+    let sigterm_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
+    let sigterm_event_ctrlc = sigterm_event
+        .try_clone()
+        .exit_context(Exit::CloneEvent, "failed to clone event")?;
+    ctrlc::set_handler(move || {
+        sigterm_event_ctrlc.write(0).unwrap();
+    })
+    .exit_context(Exit::SetSigintHandler, "failed to set sigint handler")?;
+    wait_ctx.add(&sigterm_event, Token::Sigterm).exit_context(
+        Exit::WaitContextAdd,
+        "failed to add trigger to event context",
+    )?;
+
+    let mut metric_tubes = Vec::new();
+    let metrics_controller = spawn_child(
+        current_exe().unwrap().to_str().unwrap(),
+        &["run-metrics"],
+        get_log_path(&cfg, "metrics_stdout.log"),
+        get_log_path(&cfg, "metrics_stderr.log"),
+        ProcessType::Metrics,
+        &mut children,
+        &mut wait_ctx,
+        /* skip_bootstrap= */
+        #[cfg(test)]
+        false,
+        /* use_sandbox= */
+        cfg.jail_config.is_some(),
+        Vec::new(),
+        &cfg,
+    )?;
+    metrics_controller
+        .tube_transporter
+        .serialize_and_transport(metrics_controller.process_id)
+        .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?;
+
+    let mut main_child = spawn_child(
+        current_exe().unwrap().to_str().unwrap(),
+        &["run-main"],
+        get_log_path(&cfg, "main_stdout.log"),
+        get_log_path(&cfg, "main_stderr.log"),
+        ProcessType::Main,
+        &mut children,
+        &mut wait_ctx,
+        /* skip_bootstrap= */
+        #[cfg(test)]
+        false,
+        /* use_sandbox= */
+        cfg.jail_config.is_some(),
+        Vec::new(),
+        &cfg,
+    )?;
+
+    // Save block children `ChildProcess` so TubeTransporter and Tubes don't get closed.
+    let _block_children = start_up_block_backends(
+        &mut cfg,
+        &mut children,
+        &mut exit_events,
+        &mut wait_ctx,
+        &mut main_child,
+        &mut metric_tubes,
+        #[cfg(feature = "process-invariants")]
+        &process_invariants,
+    )?;
+
+    let (_slirp_child, _net_children) = start_up_net_backend(
+        &mut main_child,
+        &mut children,
+        &mut exit_events,
+        &mut wait_ctx,
+        &mut cfg,
+        &mut metric_tubes,
+        #[cfg(feature = "process-invariants")]
+        &process_invariants,
+    )?;
+
+    // Wait until all device processes are spun up so the main TubeTransporter will have all the
+    // device control and Vhost tubes.
+    main_child
+        .tube_transporter
+        .serialize_and_transport(main_child.process_id)
+        .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?;
+    main_child.bootstrap_tube.send(&cfg).unwrap();
+
+    let main_startup_args = CommonChildStartupArgs::new(
+        get_log_path(&cfg, "main_syslog.log"),
+        #[cfg(feature = "crash-report")]
+        create_crash_report_attrs(&cfg, product_type::EMULATOR),
+        #[cfg(feature = "process-invariants")]
+        process_invariants.clone(),
+        Some(metrics_tube_pair(&mut metric_tubes)?),
+    )?;
+    main_child.bootstrap_tube.send(&main_startup_args).unwrap();
+
+    let exit_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
+    main_child.bootstrap_tube.send(&exit_event).unwrap();
+    exit_events.push(exit_event);
+
+    // Set up our own metrics agent.
+    {
+        let broker_metrics = metrics_tube_pair(&mut metric_tubes)?;
+        metrics::initialize(broker_metrics);
+        #[cfg(feature = "kiwi")]
+        {
+            let use_vulkan = if cfg!(feature = "gpu") {
+                match &cfg.gpu_parameters {
+                    Some(params) => Some(params.use_vulkan),
+                    None => {
+                        warn!("No GPU parameters set on CrosVM config.");
+                        None
+                    }
+                }
+            } else {
+                None
+            };
+            anti_tamper::setup_common_metric_invariants(
+                &cfg.product_version,
+                &cfg.product_channel,
+                &use_vulkan,
+            );
+        }
+    }
+
+    // We have all the metrics tubes from other children, so give them to the metrics controller
+    // along with a startup configuration.
+    let metrics_startup_args = CommonChildStartupArgs::new(
+        get_log_path(&cfg, "metrics_syslog.log"),
+        #[cfg(feature = "crash-report")]
+        create_crash_report_attrs(&cfg, product_type::METRICS),
+        #[cfg(feature = "process-invariants")]
+        process_invariants.clone(),
+        None,
+    )?;
+    metrics_controller
+        .bootstrap_tube
+        .send(&metrics_startup_args)
+        .unwrap();
+
+    metrics_controller
+        .bootstrap_tube
+        .send(&metric_tubes)
+        .unwrap();
+
+    Supervisor::broker_supervise_loop(children, wait_ctx, exit_events)
+}
+/// Shuts down the metrics process, waiting for it to close to ensure
+/// all logs are flushed.
+fn clean_up_metrics(metrics_child: ChildCleanup) -> Result<()> {
+    // This will close the final metrics connection, triggering a metrics
+    // process shutdown.
+    metrics::get_destructor().cleanup();
+
+    // However, we still want to wait for the metrics process to finish
+    // flushing any pending logs before exiting.
+    let metrics_cleanup_wait = WaitContext::<u32>::new().exit_context(
+        Exit::CreateWaitContext,
+        "failed to create metrics wait context",
+    )?;
+    let mut metrics_timeout =
+        Timer::new().exit_context(Exit::CreateTimer, "failed to create metrics timeout timer")?;
+    metrics_timeout
+        .reset(EXIT_TIMEOUT, None)
+        .exit_context(Exit::ResetTimer, "failed to reset timer")?;
+    metrics_cleanup_wait.add(&metrics_timeout, 0).exit_context(
+        Exit::WaitContextAdd,
+        "failed to add metrics timeout to wait context",
+    )?;
+    metrics_cleanup_wait
+        .add(metrics_child.child.as_descriptor(), 1)
+        .exit_context(
+            Exit::WaitContextAdd,
+            "failed to add metrics process to wait context",
+        )?;
+    let events = metrics_cleanup_wait
+        .wait()
+        .context("failed to wait for metrics context")?;
+
+    let mut process_exited = false;
+    if events
+        .iter()
+        .find(|e| e.is_readable && e.token == 1)
+        .is_some()
+    {
+        process_exited = true;
+    }
+
+    if !process_exited {
+        warn!(
+            "broker: Metrics process timed out before cleanly exiting.
+            This may indicate some logs remain unsent."
+        );
+        // Process will be force-killed on drop
+    }
+
+    Ok(())
+}
+
+#[cfg(feature = "crash-report")]
+fn init_broker_crash_reporting(cfg: &mut Config) -> Result<()> {
+    cfg.crash_report_uuid = Some(generate_uuid());
+    if cfg.crash_pipe_name.is_none() {
+        // We weren't started by the service. Spin up a crash reporter to be shared with all
+        // children.
+        cfg.crash_pipe_name = Some(
+            crash_report::setup_crash_reporting(create_crash_report_attrs(
+                &cfg,
+                product_type::BROKER,
+            ))
+            .exit_context(Exit::CrashReportingInit, "failed to init crash reporting")?,
+        );
+    } else {
+        crash_report::setup_crash_reporting(create_crash_report_attrs(&cfg, product_type::BROKER))
+            .exit_context(Exit::CrashReportingInit, "failed to init crash reporting")?;
+    }
+
+    Ok(())
+}
+
+struct Supervisor {
+    children: HashMap<u32, ChildCleanup>,
+    wait_ctx: WaitContext<Token>,
+    exit_events: Vec<Event>,
+    exit_timer: Option<Timer>,
+}
+
+impl Supervisor {
+    pub fn broker_supervise_loop(
+        children: HashMap<u32, ChildCleanup>,
+        wait_ctx: WaitContext<Token>,
+        exit_events: Vec<Event>,
+    ) -> Result<()> {
+        let mut supervisor = Supervisor {
+            children,
+            wait_ctx,
+            exit_events,
+            exit_timer: None,
+        };
+        let result = supervisor.broker_loop();
+
+        // Once the supervise loop exits, we are exiting and just need to clean
+        // up. In error cases, there could still be children processes, so we close
+        // those first, and finally drop the metrics process.
+        supervisor.children.retain(|_, child| {
+            match child.process_type {
+                ProcessType::Metrics => true,
+                _ => {
+                    warn!(
+                        "broker: Forcibly closing child (type: {:?}). This often means
+                        the child was unable to close within the normal timeout window,
+                        or the broker itself failed with an error.",
+                        child.process_type
+                    );
+                    // Child killed on drop
+                    false
+                }
+            }
+        });
+
+        {
+            if supervisor.is_only_metrics_process_running() {
+                clean_up_metrics(supervisor.children.into_values().next().unwrap())?;
+            } else {
+                warn!(
+                    "broker: Metrics process not running after cleanup.
+                    This may indicate some exit logs have been dropped."
+                );
+            }
+        }
+
+        result
+    }
+
+    /// We require exactly one main process.
+    fn assert_children_sane(&mut self) {
+        let main_processes = self
+            .children
+            .iter()
+            .filter(|(_, child)| child.process_type == ProcessType::Main)
+            .count();
+        if main_processes != 1 {
+            // Why do we have to clear children? Well, panic *can* cause destructors not to run,
+            // which means these children won't be cleaned up. The exact explanation for this isn't
+            // clear, but it reproduced consistently. So since we're panicking, we'll be careful.
+            self.children.clear();
+            panic!(
+                "Broker must supervise exactly one main process. Got {} main process(es).",
+                main_processes,
+            )
+        }
+    }
+
+    fn is_only_metrics_process_running(&self) -> bool {
+        self.children.len() == 1
+            && self.children.values().next().unwrap().process_type == ProcessType::Metrics
+    }
+
+    fn all_non_metrics_processes_exited(&self) -> bool {
+        #[cfg(not(feature = "kiwi"))]
+        return self.children.len() == 0;
+        #[cfg(feature = "kiwi")]
+        return self.children.len() == 0 || self.is_only_metrics_process_running();
+    }
+
+    fn start_exit_timer(&mut self, timeout_token: Token) -> Result<()> {
+        if self.exit_timer.is_some() {
+            return Ok(());
+        }
+
+        let mut et = Timer::new().exit_context(Exit::CreateTimer, "failed to create timer")?;
+        et.reset(EXIT_TIMEOUT, None)
+            .exit_context(Exit::ResetTimer, "failed to reset timer")?;
+        self.wait_ctx.add(&et, timeout_token).exit_context(
+            Exit::WaitContextAdd,
+            "failed to add trigger to wait context",
+        )?;
+        self.exit_timer = Some(et);
+
+        Ok(())
+    }
+    /// Once children have been spawned, this function is called to run the supervision loop, which
+    /// waits for processes to exit and handles errors.
+    fn broker_loop(&mut self) -> Result<()> {
+        const KILLED_BY_SIGNAL: ExitCode = Exit::KilledBySignal as ExitCode;
+        self.assert_children_sane();
+        let mut first_nonzero_exitcode = None;
+
+        while !self.all_non_metrics_processes_exited() {
+            let events = self
+                .wait_ctx
+                .wait()
+                .context("failed to wait for event context")?;
+
+            for event in events.iter().filter(|e| e.is_readable) {
+                match event.token {
+                    Token::Sigterm => {
+                        // Signal all children other than metrics to exit.
+                        for exit_event in &self.exit_events {
+                            if let Err(e) = exit_event.write(1) {
+                                error!("failed to signal exit event to child: {}", e);
+                            }
+                        }
+                        first_nonzero_exitcode.get_or_insert(KILLED_BY_SIGNAL);
+                        self.start_exit_timer(Token::SigtermTimeout)?;
+                    }
+                    Token::Process(child_id) => {
+                        let mut child = self.children.remove(&child_id).unwrap();
+                        let process_handle = Descriptor(child.child.as_raw_descriptor());
+                        self.wait_ctx.delete(&process_handle).exit_context(
+                            Exit::WaitContextDelete,
+                            "failed to remove trigger from event context",
+                        )?;
+                        if let Some(dh_tube) = child.dh_tube.as_ref() {
+                            self.wait_ctx
+                                .delete(dh_tube.get_read_notifier())
+                                .exit_context(
+                                    Exit::WaitContextDelete,
+                                    "failed to remove trigger from event context",
+                                )?;
+                        }
+
+                        let exit_code = child.child.wait().unwrap().unwrap();
+                        info!(
+                            "broker: child (type {:?}) exited {}",
+                            child.process_type,
+                            ExitCodeWrapper(exit_code),
+                        );
+
+                        // Save the child's exit code (to pass through to the broker's exit code) if
+                        // none has been saved or if the previously saved exit code was
+                        // KilledBySignal. We overwrite KilledBySignal because the child exit may
+                        // race with the sigterm from the service, esp if child exit is slowed by a
+                        // Crashpad dump, and we don't want to lose the child's exit code if it was
+                        // the initial cause of the emulator failing.
+                        if exit_code != 0
+                            && (first_nonzero_exitcode.is_none()
+                                || matches!(first_nonzero_exitcode, Some(KILLED_BY_SIGNAL)))
+                        {
+                            info!(
+                                "setting first_nonzero_exitcode {:?} -> {}",
+                                first_nonzero_exitcode, exit_code,
+                            );
+                            first_nonzero_exitcode =
+                                Some(to_process_type_error(exit_code as u32, child.process_type)
+                                    as i32);
+                        }
+
+                        let timeout_token = match child.process_type {
+                            ProcessType::Main => Token::MainExitTimeout,
+                            ProcessType::Metrics => Token::MetricsExitTimeout,
+                            _ => Token::DeviceExitTimeout,
+                        };
+                        self.start_exit_timer(timeout_token)?;
+                    }
+                    Token::SigtermTimeout => {
+                        if let Some(exit_code) = first_nonzero_exitcode {
+                            if exit_code != KILLED_BY_SIGNAL {
+                                bail_exit_code!(
+                                    exit_code,
+                                    "broker got sigterm, but a child exited with an error.",
+                                );
+                            }
+                        }
+                        ensure_exit_code!(
+                            self.all_non_metrics_processes_exited(),
+                            Exit::BrokerSigtermTimeout,
+                            "broker got sigterm, but other broker children did not exit within the \
+                            timeout",
+                        );
+                    }
+                    Token::MainExitTimeout => {
+                        if let Some(exit_code) = first_nonzero_exitcode {
+                            bail_exit_code!(
+                                exit_code,
+                                "main exited, but a child exited with an error.",
+                            );
+                        }
+                        ensure_exit_code!(
+                            self.all_non_metrics_processes_exited(),
+                            Exit::BrokerMainExitedTimeout,
+                            "main exited, but other broker children did not exit within the \
+                            timeout",
+                        );
+                    }
+                    Token::DeviceExitTimeout => {
+                        // A device process exited, but there are still other processes running.
+                        if let Some(exit_code) = first_nonzero_exitcode {
+                            bail_exit_code!(
+                                exit_code,
+                                "a device exited, and either it or another child exited with an \
+                                error.",
+                            );
+                        }
+                        ensure_exit_code!(
+                            self.all_non_metrics_processes_exited(),
+                            Exit::BrokerDeviceExitedTimeout,
+                            "device exited, but other broker children did not exit within the \
+                            timeout",
+                        );
+                    }
+                    Token::MetricsExitTimeout => {
+                        // The metrics server exited, but there are still other processes running.
+                        if let Some(exit_code) = first_nonzero_exitcode {
+                            bail_exit_code!(
+                                exit_code,
+                                "metrics server exited, and either it or another child exited with \
+                                an error.",
+                            );
+                        }
+                        ensure_exit_code!(
+                            self.children.len() == 0,
+                            Exit::BrokerMetricsExitedTimeout,
+                            "metrics exited, but other broker children did not exit within the \
+                            timeout",
+                        );
+                    }
+                    Token::DuplicateHandle(child_id) => {
+                        if let Some(tube) = &self.children[&child_id].dh_tube {
+                            let req: DuplicateHandleRequest = tube
+                                .recv()
+                                .exit_context(Exit::TubeFailure, "failed operation on tube")?;
+                            if !self.children.contains_key(&req.target_alias_pid) {
+                                error!(
+                                    "DuplicateHandleRequest contained invalid alias pid: {}",
+                                    req.target_alias_pid
+                                );
+                                tube.send(&DuplicateHandleResponse { handle: None })
+                                    .exit_context(Exit::TubeFailure, "failed operation on tube")?;
+                            } else {
+                                let target = &self.children[&req.target_alias_pid].child;
+                                let handle = win_util::duplicate_handle_from_source_process(
+                                    self.children[&child_id].child.as_raw_descriptor(),
+                                    req.handle as RawHandle,
+                                    target.as_raw_descriptor(),
+                                );
+                                match handle {
+                                    Ok(handle) => tube
+                                        .send(&DuplicateHandleResponse {
+                                            handle: Some(handle as usize),
+                                        })
+                                        .exit_context(
+                                            Exit::TubeFailure,
+                                            "failed operation on tube",
+                                        )?,
+                                    Err(e) => {
+                                        error!("Failed to duplicate handle: {}", e);
+                                        tube.send(&DuplicateHandleResponse { handle: None })
+                                            .exit_context(
+                                                Exit::TubeFailure,
+                                                "failed operation on tube",
+                                            )?
+                                    }
+                                };
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        if let Some(exit_code) = first_nonzero_exitcode {
+            bail_exit_code!(
+                exit_code,
+                if exit_code == KILLED_BY_SIGNAL {
+                    "broker got sigterm, and all children exited zero from shutdown event."
+                } else {
+                    "all processes exited, but at least one encountered an error."
+                },
+            );
+        }
+
+        Ok(())
+    }
+}
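// Aside: the exit-code precedence rule in broker_loop, as a standalone sketch
// (illustrative): keep the first nonzero exit code, except that a code
// recorded from SIGTERM may be overwritten by a real child failure racing it.
fn fold_exit_code(current: Option<i32>, new_code: i32, killed_by_signal: i32) -> Option<i32> {
    if new_code != 0 && (current.is_none() || current == Some(killed_by_signal)) {
        Some(new_code)
    } else {
        current
    }
}

fn exit_code_precedence_sketch() {
    const KILLED_BY_SIGNAL: i32 = 100; // hypothetical value
    let mut code = None;
    code = fold_exit_code(code, KILLED_BY_SIGNAL, KILLED_BY_SIGNAL); // sigterm arrives first
    code = fold_exit_code(code, 3, KILLED_BY_SIGNAL); // child failure overwrites it
    assert_eq!(code, Some(3));
}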
+fn start_up_block_backends(
+    cfg: &mut Config,
+    children: &mut HashMap<u32, ChildCleanup>,
+    exit_events: &mut Vec<Event>,
+    wait_ctx: &mut WaitContext<Token>,
+    main_child: &mut ChildProcess,
+    metric_tubes: &mut Vec<Tube>,
+    #[cfg(feature = "process-invariants")] process_invariants: &EmulatorProcessInvariants,
+) -> Result<Vec<ChildProcess>> {
+    let mut block_children = Vec::new();
+    let disk_options = cfg.disks.clone();
+    for (index, disk_option) in disk_options.iter().enumerate() {
+        let block_child = spawn_block_backend(index, main_child, children, wait_ctx, cfg)?;
+
+        let startup_args = CommonChildStartupArgs::new(
+            get_log_path(cfg, &format!("disk_{}_syslog.log", index)),
+            #[cfg(feature = "crash-report")]
+            create_crash_report_attrs(cfg, &format!("{}_{}", product_type::DISK, index)),
+            #[cfg(feature = "process-invariants")]
+            process_invariants.clone(),
+            Some(metrics_tube_pair(metric_tubes)?),
+        )?;
+        block_child.bootstrap_tube.send(&startup_args).unwrap();
+
+        block_child.bootstrap_tube.send(&disk_option).unwrap();
+
+        let exit_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
+        block_child.bootstrap_tube.send(&exit_event).unwrap();
+        exit_events.push(exit_event);
+        block_children.push(block_child);
+    }
+
+    Ok(block_children)
+}
+
+fn spawn_block_backend(
+    log_index: usize,
+    main_child: &mut ChildProcess,
+    children: &mut HashMap<u32, ChildCleanup>,
+    wait_ctx: &mut WaitContext<Token>,
+    cfg: &mut Config,
+) -> Result<ChildProcess> {
+    let (mut vhost_user_main_tube, mut vhost_user_device_tube) =
+        Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
+
+    let (mut disk_host_tube, mut disk_device_tube) =
+        Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
+
+    disk_device_tube.set_target_pid(main_child.alias_pid);
+    vhost_user_device_tube.set_target_pid(main_child.alias_pid);
+    let block_child = spawn_child(
+        current_exe().unwrap().to_str().unwrap(),
+        &["device", "block"],
+        get_log_path(&cfg, &format!("disk_{}_stdout.log", log_index)),
+        get_log_path(&cfg, &format!("disk_{}_stderr.log", log_index)),
+        ProcessType::Block,
+        children,
+        wait_ctx,
+        /* skip_bootstrap= */
+        #[cfg(test)]
+        false,
+        /* use_sandbox= */
+        cfg.jail_config.is_some(),
+        vec![
+            TubeTransferData {
+                tube: disk_device_tube,
+                tube_token: TubeToken::Control,
+            },
+            TubeTransferData {
+                tube: vhost_user_device_tube,
+                tube_token: TubeToken::VhostUser,
+            },
+        ],
+        cfg,
+    )?;
+
+    block_child
+        .tube_transporter
+        .serialize_and_transport(block_child.process_id)
+        .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?;
+
+    vhost_user_main_tube.set_target_pid(block_child.alias_pid);
+    disk_host_tube.set_target_pid(block_child.alias_pid);
+    cfg.block_control_tube.push(disk_host_tube);
+    cfg.block_vhost_user_tube.push(vhost_user_main_tube);
+
+    Ok(block_child)
+}
+fn spawn_sandboxed_child<I, S>(
+    program: &str,
+    args: I,
+    stdout_file: Option<std::fs::File>,
+    stderr_file: Option<std::fs::File>,
+    handles_to_inherit: Vec<&dyn AsRawDescriptor>,
+    process_policy: sandbox::policy::Policy,
+) -> Result<(u32, Box<dyn Child>)>
+where
+    I: IntoIterator<Item = S>,
+    S: AsRef<OsStr>,
+{
+    let mut broker = sandbox::BrokerServices::get()
+        .exit_context(Exit::SandboxError, "sandbox operation failed")?
+        .unwrap();
+    let mut policy = broker.create_policy();
+    policy
+        .set_token_level(
+            process_policy.initial_token_level,
+            process_policy.lockdown_token_level,
+        )
+        .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    policy
+        .set_job_level(process_policy.job_level, 0)
+        .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    policy
+        .set_integrity_level(process_policy.integrity_level)
+        .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    policy
+        .set_delayed_integrity_level(process_policy.delayed_integrity_level)
+        .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+
+    if process_policy.alternate_desktop {
+        policy
+            .set_alternate_desktop(process_policy.alternate_winstation)
+            .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    }
+
+    for rule in process_policy.exceptions {
+        policy
+            .add_rule(rule.subsystem, rule.semantics, rule.pattern)
+            .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    }
+
+    policy.set_lockdown_default_dacl();
+
+    if let Some(file) = stdout_file.as_ref() {
+        policy
+            .set_stdout_from_file(file)
+            .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    }
+
+    if let Some(file) = stderr_file.as_ref() {
+        policy
+            .set_stderr_from_file(file)
+            .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    }
+
+    for handle in handles_to_inherit.into_iter() {
+        policy.add_handle_to_share(handle);
+    }
+
+    for dll in process_policy.dll_blocklist.into_iter() {
+        policy
+            .add_dll_to_unload(&dll)
+            .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    }
+
+    // spawn_target uses CreateProcessW to create a new process, which will pass
+    // the command line arguments verbatim to the new process. Most processes
+    // expect that argv[0] will be the program name, so provide that before the
+    // rest of the args.
+    let command_line = args
+        .into_iter()
+        .fold(format!("\"{}\"", program), |mut args, arg| {
+            args.push(' ');
+            args.push_str(OsStr::new(&arg).to_str().unwrap());
+            args
+        });
+
+    let (target, warning) = broker
+        .spawn_target(program, &command_line, &policy)
+        .exit_context(Exit::SandboxError, "sandbox operation failed")?;
+    if let Some(w) = warning {
+        warn!("sandbox: got warning spawning target: {}", w);
+    }
+    win_util::resume_thread(target.thread.as_raw_descriptor())
+        .exit_context(Exit::ProcessSpawnFailed, "failed to spawn child process")?;
+
+    Ok((target.process_id, Box::new(SandboxedChild(target.process))))
+}
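// Aside: the command-line construction above, in isolation (illustrative):
// argv[0] is the quoted program name, and arguments are appended
// space-separated, since CreateProcessW passes the string through verbatim.
fn build_command_line(program: &str, args: &[&str]) -> String {
    args.iter().fold(format!("\"{}\"", program), |mut line, arg| {
        line.push(' ');
        line.push_str(arg);
        line
    })
}

fn command_line_sketch() {
    assert_eq!(
        build_command_line("C:\\crosvm.exe", &["device", "block"]),
        "\"C:\\crosvm.exe\" device block"
    );
}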
.send(&net_child_startup_args) + .unwrap(); + + let net_backend_config = NetBackendConfig { + guest_pipe, + slirp_kill_event, + }; + net_child.bootstrap_tube.send(&net_backend_config).unwrap(); + let exit_event = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?; + net_child.bootstrap_tube.send(&exit_event).unwrap(); + exit_events.push(exit_event); + + Ok((slirp_child, net_child)) +} + +fn spawn_slirp( + children: &mut HashMap, + wait_ctx: &mut WaitContext, + cfg: &mut Config, +) -> Result { + let slirp_child = spawn_child( + current_exe().unwrap().to_str().unwrap(), + &["run-slirp"], + get_log_path(&cfg, "slirp_stdout.log"), + get_log_path(&cfg, "slirp_stderr.log"), + ProcessType::Slirp, + children, + wait_ctx, + /* skip_bootstrap= */ + #[cfg(test)] + false, + /* use_sandbox= */ cfg.jail_config.is_some(), + vec![], + cfg, + )?; + + slirp_child + .tube_transporter + .serialize_and_transport(slirp_child.process_id) + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + Ok(slirp_child) +} + +fn spawn_net_backend( + main_child: &mut ChildProcess, + children: &mut HashMap, + wait_ctx: &mut WaitContext, + cfg: &mut Config, +) -> Result { + let (mut vhost_user_main_tube, mut vhost_user_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + vhost_user_device_tube.set_target_pid(main_child.alias_pid); + + let net_child = spawn_child( + current_exe().unwrap().to_str().unwrap(), + &["device", "net"], + get_log_path(&cfg, "net_stdout.log"), + get_log_path(&cfg, "net_stderr.log"), + ProcessType::Net, + children, + wait_ctx, + /* skip_bootstrap= */ + #[cfg(test)] + false, + /* use_sandbox= */ cfg.jail_config.is_some(), + vec![TubeTransferData { + tube: vhost_user_device_tube, + tube_token: TubeToken::VhostUser, + }], + cfg, + )?; + + net_child + .tube_transporter + .serialize_and_transport(net_child.process_id) + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + vhost_user_main_tube.set_target_pid(net_child.alias_pid); + cfg.net_vhost_user_tube = Some(vhost_user_main_tube); + + Ok(net_child) +} + +/// Spawns a child process, sending it a control tube as the --bootstrap=HANDLE_NUMBER argument. +/// stdout & stderr are redirected to the provided file paths. 
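+///
+/// A sketch of a typical call (mirrors the call sites above; `cfg`, `children`,
+/// and `wait_ctx` are the caller's state, and the device/log names are
+/// illustrative):
+/// ```ignore
+/// let child = spawn_child(
+///     current_exe().unwrap().to_str().unwrap(),
+///     &["device", "net"],
+///     get_log_path(&cfg, "net_stdout.log"),
+///     get_log_path(&cfg, "net_stderr.log"),
+///     ProcessType::Net,
+///     &mut children,
+///     &mut wait_ctx,
+///     /* skip_bootstrap= */
+///     #[cfg(test)]
+///     false,
+///     /* use_sandbox= */ cfg.jail_config.is_some(),
+///     vec![],
+///     cfg,
+/// )?;
+/// child
+///     .tube_transporter
+///     .serialize_and_transport(child.process_id)?;
+/// ```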
+fn spawn_child( + program: &str, + args: I, + stdout_path: Option, + stderr_path: Option, + process_type: ProcessType, + children: &mut HashMap, + wait_ctx: &mut WaitContext, + #[cfg(test)] skip_bootstrap: bool, + use_sandbox: bool, + mut tubes: Vec, + #[allow(unused_variables)] cfg: &Config, +) -> Result +where + I: IntoIterator, + S: AsRef, +{ + let (tube_transport_pipe, tube_transport_main_child) = named_pipes::pair( + &FramingMode::Message, + &BlockingMode::Wait, + /* timeout= */ 0, + ) + .exit_context(Exit::CreateSocket, "failed to create socket")?; + + let stdout_file = if let Some(path) = stdout_path { + Some( + OpenOptions::new() + .append(true) + .create(true) + .open(path.as_path()) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", path.display()) + })?, + ) + } else { + None + }; + + let stderr_file = if let Some(path) = stderr_path { + Some( + OpenOptions::new() + .append(true) + .create(true) + .open(path.as_path()) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", path.display()) + })?, + ) + } else { + None + }; + + #[cfg(test)] + let bootstrap = if !skip_bootstrap { + vec![ + "--bootstrap".to_string(), + (tube_transport_main_child.as_raw_descriptor() as usize).to_string(), + ] + } else { + vec![] + }; + #[cfg(not(test))] + let bootstrap = vec![ + "--bootstrap".to_string(), + (tube_transport_main_child.as_raw_descriptor() as usize).to_string(), + ]; + + let input_args: Vec = args.into_iter().collect(); + let args = input_args + .iter() + .map(|arg| arg.as_ref()) + .chain(bootstrap.iter().map(|arg| arg.as_ref())); + + let (process_id, child) = if use_sandbox { + spawn_sandboxed_child( + program, + args, + stdout_file, + stderr_file, + vec![&tube_transport_main_child], + process_policy(process_type, cfg), + )? + } else { + spawn_unsandboxed_child( + program, + args, + stdout_file, + stderr_file, + vec![&tube_transport_main_child], + )? + }; + + let (mut bootstrap_tube, bootstrap_tube_child) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + // Make sure our end of the Tube knows the PID of the child end. + bootstrap_tube.set_target_pid(process_id); + + tubes.push(TubeTransferData { + tube: bootstrap_tube_child, + tube_token: TubeToken::Bootstrap, + }); + + let (dh_tube, dh_tube_child, alias_pid) = if use_sandbox { + let (broker, child) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + (Some(broker), Some(child), rand::random()) + } else { + (None, None, process_id) + }; + + let tube_transporter = + TubeTransporter::new(tube_transport_pipe, tubes, Some(alias_pid), dh_tube_child); + + // Register this child to be waited upon. + let process_handle = Descriptor(child.as_raw_descriptor()); + wait_ctx + .add(&process_handle, Token::Process(alias_pid)) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to event context", + )?; + + children.insert( + alias_pid, + ChildCleanup { + process_type, + child, + dh_tube, + }, + ); + + if use_sandbox { + wait_ctx + .add( + children[&alias_pid] + .dh_tube + .as_ref() + .unwrap() + .get_read_notifier(), + Token::DuplicateHandle(alias_pid), + ) + .exit_context( + Exit::WaitContextAdd, + "failed to add trigger to event context", + )?; + } + + Ok(ChildProcess { + bootstrap_tube, + tube_transporter, + process_id, + alias_pid, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use base::thread::spawn_with_timeout; + + /// Verifies that the supervisor loop exits normally with a single child that exits. 
+ #[test] + fn smoke_test() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events).unwrap(); + }) + .try_join(Duration::from_secs(5)) + .unwrap(); + } + + /// Verifies that the supervisor loop exits normally when a device exits first, and then + /// the main loop exits. + #[test] + fn main_and_device_clean_exit() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["3"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + let _child_device = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Block, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events).unwrap(); + }) + .try_join(Duration::from_secs(5)) + .unwrap(); + } + + /// Verifies that the supervisor loop ends even if a device takes too long to exit. + #[test] + fn device_takes_too_long_to_exit() { + spawn_with_timeout(|| { + let mut children: HashMap = HashMap::new(); + let mut wait_ctx: WaitContext = WaitContext::new().unwrap(); + let exit_events = vec![Event::new().unwrap()]; + let _child_main = spawn_child( + "sleep", + &["1"], + None, + None, + ProcessType::Main, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + let _child_device = spawn_child( + "sleep", + &["10"], + None, + None, + ProcessType::Block, + &mut children, + &mut wait_ctx, + /* skip_bootstrap= */ true, + /* use_sandbox= */ false, + Vec::new(), + &Config::default(), + ); + + assert_eq!( + Supervisor::broker_supervise_loop(children, wait_ctx, exit_events) + .to_exit_code() + .unwrap(), + ExitCode::from(Exit::BrokerMainExitedTimeout), + ); + }) + .try_join(Duration::from_secs(10)) + .unwrap(); + } + + /// Verifies that the supervisor loop ends even if the main process takes too long to exit. 
+    #[test]
+    fn main_takes_too_long_to_exit() {
+        spawn_with_timeout(|| {
+            let mut children: HashMap<u32, ChildCleanup> = HashMap::new();
+            let mut wait_ctx: WaitContext<Token> = WaitContext::new().unwrap();
+            let exit_events = vec![Event::new().unwrap()];
+            let _child_main = spawn_child(
+                "sleep",
+                &["10"],
+                None,
+                None,
+                ProcessType::Main,
+                &mut children,
+                &mut wait_ctx,
+                /* skip_bootstrap= */ true,
+                /* use_sandbox= */ false,
+                Vec::new(),
+                &Config::default(),
+            );
+            let _child_device = spawn_child(
+                "sleep",
+                &["1"],
+                None,
+                None,
+                ProcessType::Block,
+                &mut children,
+                &mut wait_ctx,
+                /* skip_bootstrap= */ true,
+                /* use_sandbox= */ false,
+                Vec::new(),
+                &Config::default(),
+            );
+
+            assert_eq!(
+                Supervisor::broker_supervise_loop(children, wait_ctx, exit_events)
+                    .to_exit_code()
+                    .unwrap(),
+                ExitCode::from(Exit::BrokerDeviceExitedTimeout),
+            );
+        })
+        .try_join(Duration::from_secs(10))
+        .unwrap();
+    }
+
+    /// Verifies that a device crash is surfaced as a per-process-type child error exit code.
+    #[test]
+    fn device_crash_returns_child_error() {
+        spawn_with_timeout(|| {
+            let mut children: HashMap<u32, ChildCleanup> = HashMap::new();
+            let mut wait_ctx: WaitContext<Token> = WaitContext::new().unwrap();
+            let exit_events = vec![Event::new().unwrap()];
+            let _child_main = spawn_child(
+                "sleep",
+                &["1"],
+                None,
+                None,
+                ProcessType::Main,
+                &mut children,
+                &mut wait_ctx,
+                /* skip_bootstrap= */ true,
+                /* use_sandbox= */ false,
+                Vec::new(),
+                &Config::default(),
+            );
+            let _child_device = spawn_child(
+                "cmd",
+                &["/c", "exit -1"],
+                None,
+                None,
+                ProcessType::Block,
+                &mut children,
+                &mut wait_ctx,
+                /* skip_bootstrap= */ true,
+                /* use_sandbox= */ false,
+                Vec::new(),
+                &Config::default(),
+            );
+
+            assert_eq!(
+                Supervisor::broker_supervise_loop(children, wait_ctx, exit_events)
+                    .to_exit_code()
+                    .unwrap(),
+                ExitCode::from(to_process_type_error(-1i32 as u32, ProcessType::Block) as i32),
+            );
+        })
+        .try_join(Duration::from_secs(10))
+        .unwrap();
+    }
+
+    /// Verifies that sigterm makes the supervisor loop signal the exit event.
+    #[test]
+    fn sigterm_signals_exit_event() {
+        let exit_event = Event::new().unwrap();
+        let exit_event_copy = exit_event.try_clone().unwrap();
+
+        spawn_with_timeout(move || {
+            let sigterm_event = Event::new().unwrap();
+            let mut wait_ctx: WaitContext<Token> = WaitContext::new().unwrap();
+            let mut children: HashMap<u32, ChildCleanup> = HashMap::new();
+            let _child_main = spawn_child(
+                "sleep",
+                &["2"],
+                None,
+                None,
+                ProcessType::Main,
+                &mut children,
+                &mut wait_ctx,
+                /* skip_bootstrap= */ true,
+                /* use_sandbox= */ false,
+                Vec::new(),
+                &Config::default(),
+            );
+            wait_ctx.add(&sigterm_event, Token::Sigterm).unwrap();
+            sigterm_event.write(1).unwrap();
+
+            assert_eq!(
+                Supervisor::broker_supervise_loop(children, wait_ctx, vec![exit_event_copy])
+                    .to_exit_code()
+                    .unwrap(),
+                ExitCode::from(Exit::KilledBySignal),
+            );
+        })
+        .try_join(Duration::from_secs(10))
+        .unwrap();
+
+        exit_event.read_timeout(Duration::from_secs(0)).unwrap();
+    }
+}
diff --git a/src/crosvm/sys/windows/cmdline.rs b/src/crosvm/sys/windows/cmdline.rs
new file mode 100644
index 0000000000..699336a497
--- /dev/null
+++ b/src/crosvm/sys/windows/cmdline.rs
@@ -0,0 +1,85 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use argh::FromArgs;
+
+use argh_helpers::generate_catchall_args;
+#[derive(Debug, FromArgs)]
+#[argh(subcommand)]
+/// Windows Devices
+pub enum DevicesSubcommand {}
+
+#[cfg(feature = "slirp")]
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-slirp")]
+/// Start a new slirp instance
+pub struct RunSlirpCommand {}
+
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-main")]
+/// Start a new broker instance
+pub struct RunMainCommand {}
+
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-metrics")]
+/// Start a new metrics instance
+pub struct RunMetricsCommand {}
+
+/// Start a new mp crosvm instance
+#[generate_catchall_args]
+#[argh(subcommand, name = "run-mp")]
+pub struct RunMPCommand {}
+
+#[derive(FromArgs)]
+#[argh(subcommand)]
+/// Windows subcommands
+pub enum Commands {
+    RunMetrics(RunMetricsCommand),
+    RunMP(RunMPCommand),
+    #[cfg(feature = "slirp")]
+    RunSlirp(RunSlirpCommand),
+    RunMain(RunMainCommand),
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use crate::crosvm::cmdline::RunCommand;
+
+    fn get_args() -> Vec<&'static str> {
+        vec![
+            "--bios", "C:\\src\\crosvm\\out\\image\\default\\images\\bios.rom",
+            "--crash-pipe-name", "\\\\.\\pipe\\crashpad_27812_XGTCCTBYULHHLEJU", "--cpus", "4",
+            "--mem", "8192",
+            "--log-file", "C:\\tmp\\Emulator.log",
+            "--kernel-log-file", "C:\\tmp\\Hypervisor.log",
+            "--logs-directory", "C:\\tmp\\emulator_logs",
+            "--serial", "hardware=serial,num=1,type=file,path=C:\\tmp\\AndroidSerial.log,earlycon=true",
+            "--serial", "hardware=virtio-console,num=1,type=file,path=C:\\tmp\\AndroidSerial.log,console=true",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\aggregate.img",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\metadata.img",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\userdata.img",
+            "--rwdisk", "C:\\src\\crosvm\\out\\image\\default\\avd\\misc.img",
+            "--process-invariants-handle", "7368", "--process-invariants-size", "568",
+            "--gpu", "angle=true,backend=gfxstream,egl=true,gles=false,glx=false,refresh_rate=60,surfaceless=false,vulkan=true,wsi=vk,display_mode=borderless_full_screen,hidden",
+            "--host-guid", "09205719-879f-4324-8efc-3e362a4096f4",
+            "--ac97", "backend=win_audio",
+            "--cid", "3", "--multi-touch", "nil", "--mouse", "nil", "--product-version", "99.9.9.9",
+            "--product-channel", "Local", "--product-name", "Play Games",
+            "--service-pipe-name", "service-ipc-8244a83a-ae3f-486f-9c50-3fc47b309d27",
+            "--pstore", "path=C:\\tmp\\pstore,size=1048576",
+            "--pvclock",
+            "--params", "fake args",
+        ]
+    }
+
+    #[test]
+    fn parse_run_mp_test() {
+        let _ = RunMPCommand::from_args(&[&"run-mp"], &get_args()).unwrap();
+    }
+
+    #[test]
+    fn parse_run_test() {
+        let _ = RunCommand::from_args(&[&"run-main"], &get_args()).unwrap();
+    }
+}
diff --git a/src/crosvm/sys/windows/config.rs b/src/crosvm/sys/windows/config.rs
new file mode 100644
index 0000000000..91dddb27e0
--- /dev/null
+++ b/src/crosvm/sys/windows/config.rs
@@ -0,0 +1,822 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +use std::str::FromStr; + +#[cfg(feature = "gpu")] +use base::info; +#[cfg(all(feature = "prod-build", feature = "kiwi"))] +use devices::serial_device::SerialType; +#[cfg(feature = "gpu")] +use devices::virtio::{GpuDisplayMode, GpuDisplayParameters, GpuMode, GpuParameters}; +use devices::Ac97Parameters; +use devices::SerialParameters; +use metrics::event_details_proto::EmulatorProcessType; +#[cfg(feature = "gpu")] +use rutabaga_gfx::{calculate_context_mask, RutabagaWsi}; +use serde::{Deserialize, Serialize}; + +use crate::crosvm::{argument, config::Config}; + +#[cfg(feature = "audio")] +pub fn parse_ac97_options( + _ac97_params: &mut Ac97Parameters, + key: &str, + value: &str, +) -> Result<(), String> { + Err(format!("unknown ac97 parameter {} {}", key, value)) +} + +#[cfg(feature = "audio")] +pub(crate) fn check_ac97_backend( + #[allow(unused_variables)] ac97_params: &Ac97Parameters, +) -> Result<(), String> { + Ok(()) +} + +#[cfg(feature = "gpu")] +pub fn is_gpu_backend_deprecated(backend: &str) -> bool { + match backend { + "2d" | "2D" | "3d" | "3D" | "virglrenderer" => { + cfg!(feature = "gfxstream") + } + _ => false, + } +} + +#[cfg(feature = "gfxstream")] +pub fn use_vulkan() -> bool { + false +} + +pub fn check_serial_params( + #[allow(unused_variables)] serial_params: &SerialParameters, +) -> Result<(), String> { + #[cfg(all(feature = "prod-build", feature = "kiwi"))] + { + if matches!(serial_params.type_, SerialType::SystemSerialType) { + return Err(format!( + "device type not supported: {}", + serial_params.type_.to_string() + )); + } + if serial_params.stdin { + return Err(format!("parameter not supported: stdin")); + } + } + Ok(()) +} + +pub fn validate_config(_cfg: &mut Config) -> std::result::Result<(), String> { + Ok(()) +} + +#[cfg(feature = "gpu")] +pub fn parse_gpu_options(s: &str) -> Result { + parse_gpu_options_inner(s).map_err(|e| e.to_string()) +} + +#[cfg(feature = "gpu")] +fn parse_gpu_options_inner(s: &str) -> argument::Result { + let mut gpu_params: GpuParameters = Default::default(); + #[cfg(feature = "gfxstream")] + let mut vulkan_specified = false; + #[cfg(feature = "gfxstream")] + let mut syncfd_specified = false; + #[cfg(feature = "gfxstream")] + let mut gles31_specified = false; + #[cfg(feature = "gfxstream")] + let mut angle_specified = false; + + let mut width: Option = None; + let mut height: Option = None; + let mut dpi: Option = None; + let mut display_mode: Option = None; + #[cfg(feature = "gfxstream")] + let mut vsync: Option = None; + let opts = s + .split(',') + .map(|frag| frag.split('=')) + .map(|mut kv| (kv.next().unwrap_or(""), kv.next().unwrap_or(""))); + let mut hidden: Option = None; + + for (k, v) in opts { + match k { + "backend" => match v { + "2d" | "2D" => { + if crate::crosvm::sys::config::is_gpu_backend_deprecated(v) { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "this backend type is deprecated, please use gfxstream.", + ), + }); + } else { + gpu_params.mode = GpuMode::Mode2D; + } + } + "3d" | "3D" | "virglrenderer" => { + if crate::crosvm::sys::config::is_gpu_backend_deprecated(v) { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "this backend type is deprecated, please use gfxstream.", + ), + }); + } else { + gpu_params.mode = GpuMode::ModeVirglRenderer; + } + } + #[cfg(feature = "gfxstream")] + "gfxstream" => { + gpu_params.mode = GpuMode::ModeGfxstream; + } + _ => { + return Err(argument::Error::InvalidValue { + value: 
v.to_string(), + expected: String::from( + #[cfg(feature = "gfxstream")] + "gpu parameter 'backend' should be one of (2d|virglrenderer|gfxstream)", + #[cfg(not(feature = "gfxstream"))] + "gpu parameter 'backend' should be one of (2d|3d)", + ), + }); + } + }, + "egl" => match v { + "true" | "" => { + gpu_params.renderer_use_egl = true; + } + "false" => { + gpu_params.renderer_use_egl = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'egl' should be a boolean"), + }); + } + }, + "gles" => match v { + "true" | "" => { + gpu_params.renderer_use_gles = true; + } + "false" => { + gpu_params.renderer_use_gles = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'gles' should be a boolean"), + }); + } + }, + "glx" => match v { + "true" | "" => { + gpu_params.renderer_use_glx = true; + } + "false" => { + gpu_params.renderer_use_glx = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'glx' should be a boolean"), + }); + } + }, + "surfaceless" => match v { + "true" | "" => { + gpu_params.renderer_use_surfaceless = true; + } + "false" => { + gpu_params.renderer_use_surfaceless = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'surfaceless' should be a boolean"), + }); + } + }, + #[cfg(feature = "gfxstream")] + "syncfd" => { + syncfd_specified = true; + match v { + "true" | "" => { + gpu_params.gfxstream_use_syncfd = true; + } + "false" => { + gpu_params.gfxstream_use_syncfd = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'syncfd' should be a boolean"), + }); + } + } + } + #[cfg(feature = "gfxstream")] + "angle" => { + angle_specified = true; + match v { + "true" | "" => { + gpu_params.gfxstream_use_guest_angle = true; + } + "false" => { + gpu_params.gfxstream_use_guest_angle = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'angle' should be a boolean"), + }); + } + } + } + "vulkan" => { + #[cfg(feature = "gfxstream")] + { + vulkan_specified = true; + } + match v { + "true" | "" => { + gpu_params.use_vulkan = true; + } + "false" => { + gpu_params.use_vulkan = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'vulkan' should be a boolean"), + }); + } + } + } + #[cfg(feature = "gfxstream")] + "gles3.1" => { + gles31_specified = true; + match v { + "true" | "" => { + gpu_params.gfxstream_support_gles31 = true; + } + "false" => { + gpu_params.gfxstream_support_gles31 = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'gles3.1' should be a boolean"), + }); + } + } + } + "wsi" => match v { + "vk" => { + gpu_params.wsi = Some(RutabagaWsi::Vulkan); + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'wsi' should be vk"), + }); + } + }, + "width" => { + if let Some(width) = width { + return Err(argument::Error::TooManyArguments(format!( + "width was already specified: {}", + width + ))); + } + width = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: 
String::from("gpu parameter 'width' must be a valid integer"), + })?, + ); + } + "height" => { + if let Some(height) = height { + return Err(argument::Error::TooManyArguments(format!( + "height was already specified: {}", + height + ))); + } + height = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "gpu parameter 'height' must be a valid integer", + ), + })?, + ); + } + "dpi" => { + if let Some(dpi) = dpi { + return Err(argument::Error::TooManyArguments(format!( + "dpi was already specified: {}", + dpi + ))); + } + dpi = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'dpi' must be a valid integer"), + })?, + ); + } + #[cfg(feature = "gfxstream")] + "refresh_rate" => { + if let Some(vsync) = vsync { + return Err(argument::Error::TooManyArguments(format!( + "refresh_rate was already specified: {}", + vsync + ))); + } + vsync = Some( + v.parse::() + .map_err(|_| argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from( + "gpu parameter 'refresh_rate' must be a valid integer", + ), + })?, + ); + } + "display_mode" => { + if let Some(display_mode) = display_mode { + return Err(argument::Error::TooManyArguments(format!( + "display_mode was already specified: {}", + display_mode + ))); + } + display_mode = Some(String::from(v)); + } + "hidden" => match v { + "true" | "" => { + hidden = Some(true); + } + "false" => { + hidden = Some(false); + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'hidden' should be a boolean"), + }); + } + }, + "cache-path" => gpu_params.cache_path = Some(v.to_string()), + "cache-size" => gpu_params.cache_size = Some(v.to_string()), + "udmabuf" => match v { + "true" | "" => { + gpu_params.udmabuf = true; + } + "false" => { + gpu_params.udmabuf = false; + } + _ => { + return Err(argument::Error::InvalidValue { + value: v.to_string(), + expected: String::from("gpu parameter 'udmabuf' should be a boolean"), + }); + } + }, + "context-types" => { + let context_types: Vec = v.split(':').map(|s| s.to_string()).collect(); + gpu_params.context_mask = calculate_context_mask(context_types); + } + "" => {} + _ => { + return Err(argument::Error::UnknownArgument(format!( + "gpu parameter {}", + k + ))); + } + } + } + + match display_mode.as_deref() { + Some("windowed") => gpu_params.display_params = GpuDisplayParameters::default_windowed(), + Some("borderless_full_screen") => gpu_params.display_params = GpuDisplayParameters::default_borderless_full_screen(), + None => {} + Some(display_mode) => return Err(argument::Error::InvalidValue { + value: display_mode.to_string(), + expected: String::from("gpu parameter 'display_mode' must be either 'borderless_full_screen' or 'windowed'") + }) + } + + if let Some(hidden) = hidden { + gpu_params.display_params.hidden = hidden; + } + + #[cfg(feature = "gfxstream")] + { + if let Some(vsync) = vsync { + gpu_params.vsync = vsync; + } + } + + match gpu_params.display_params.display_mode { + GpuDisplayMode::Windowed { + width: ref mut width_in_params, + height: ref mut height_in_params, + dpi: ref mut dpi_in_params, + } => { + if let Some(width) = width { + *width_in_params = width; + } + if let Some(height) = height { + *height_in_params = height; + } + if let Some(dpi) = dpi { + *dpi_in_params = dpi; + } + } + GpuDisplayMode::BorderlessFullScreen(_) => { + if width.is_some() || height.is_some() || 
dpi.is_some() { + return Err(argument::Error::UnknownArgument( + "width, height, or dpi is only supported for windowed display mode".to_string(), + )); + } + } + } + + #[cfg(feature = "gfxstream")] + { + if !vulkan_specified && gpu_params.mode == GpuMode::ModeGfxstream { + gpu_params.use_vulkan = crate::crosvm::sys::config::use_vulkan(); + } + if syncfd_specified || angle_specified || gles31_specified { + match gpu_params.mode { + GpuMode::ModeGfxstream => {} + _ => { + return Err(argument::Error::UnknownArgument( + "gpu parameters syncfd and gles3.1 are only supported for gfxstream backend" + .to_string(), + )); + } + } + } + } + + Ok(gpu_params) +} + +#[cfg(feature = "gpu")] +pub(crate) fn validate_gpu_config(cfg: &mut Config) -> Result<(), String> { + if let Some(gpu_parameters) = cfg.gpu_parameters.as_ref() { + let (width, height) = gpu_parameters.display_params.get_virtual_display_size(); + for virtio_multi_touch in cfg.virtio_multi_touch.iter_mut() { + virtio_multi_touch.set_default_size(width, height); + } + for virtio_single_touch in cfg.virtio_single_touch.iter_mut() { + virtio_single_touch.set_default_size(width, height); + } + + let dpi = gpu_parameters.display_params.get_dpi(); + info!("using dpi {} on the Android guest", dpi); + cfg.params.push(format!("androidboot.lcd_density={}", dpi)); + } + Ok(()) +} + +/// Each type of process should have its own type here. This affects both exit +/// handling and sandboxing policy. +/// +/// WARNING: do NOT change the values items in this enum. The enum value is used in our exit codes, +/// and relied upon by metrics analysis. The max value for this enum is 0x1F = 31 as it is +/// restricted to five bits per `crate::crosvm::sys::windows::exit::to_process_type_error`. +#[derive(Clone, Copy, PartialEq, Debug, enumn::N)] +#[repr(u8)] +pub enum ProcessType { + Block = 1, + Main = 2, + Metrics = 3, + Net = 4, + Slirp = 5, +} + +impl From for EmulatorProcessType { + fn from(process_type: ProcessType) -> Self { + match process_type { + ProcessType::Block => EmulatorProcessType::PROCESS_TYPE_BLOCK, + ProcessType::Main => EmulatorProcessType::PROCESS_TYPE_MAIN, + ProcessType::Metrics => EmulatorProcessType::PROCESS_TYPE_METRICS, + ProcessType::Net => EmulatorProcessType::PROCESS_TYPE_NET, + ProcessType::Slirp => EmulatorProcessType::PROCESS_TYPE_SLIRP, + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)] +pub enum IrqChipKind { + /// All interrupt controllers are emulated in the kernel. + Kernel, + /// APIC is emulated in the kernel. All other interrupt controllers are in userspace. + Split, + /// All interrupt controllers are emulated in userspace. + Userspace, +} + +impl FromStr for IrqChipKind { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "kernel" => Ok(Self::Kernel), + "split" => Ok(Self::Split), + "userspace" => Ok(Self::Userspace), + _ => Err("invalid irqchip kind: expected \"kernel\", \"split\", or \"userspace\""), + } + } +} + +/// Hypervisor backend. 
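+///
+/// Parsed case-insensitively from its command-line name. A small sketch
+/// (assumes the matching feature, e.g. "whpx", is compiled in):
+/// ```ignore
+/// let kind: HypervisorKind = "WHPX".parse().unwrap();
+/// assert_eq!(kind, HypervisorKind::Whpx);
+/// ```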
+#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
+pub enum HypervisorKind {
+    #[cfg(feature = "gvm")]
+    Gvm,
+    #[cfg(feature = "haxm")]
+    Haxm,
+    #[cfg(feature = "haxm")]
+    Ghaxm,
+    #[cfg(feature = "whpx")]
+    Whpx,
+}
+
+impl FromStr for HypervisorKind {
+    type Err = &'static str;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            #[cfg(feature = "gvm")]
+            "gvm" => Ok(HypervisorKind::Gvm),
+            #[cfg(feature = "haxm")]
+            "haxm" => Ok(HypervisorKind::Haxm),
+            #[cfg(feature = "haxm")]
+            "ghaxm" => Ok(HypervisorKind::Ghaxm),
+            #[cfg(feature = "whpx")]
+            "whpx" => Ok(HypervisorKind::Whpx),
+            _ => Err("invalid hypervisor backend"),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[cfg(feature = "gpu")]
+    use crate::crosvm::sys::config::parse_gpu_options;
+    #[cfg(feature = "gpu")]
+    use devices::virtio::gpu::GpuDisplayMode;
+
+    #[cfg(all(feature = "gpu", feature = "gfxstream"))]
+    #[test]
+    fn parse_gpu_options_gfxstream_with_syncfd_specified() {
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("backend=gfxstream,syncfd=true").unwrap();
+
+            assert!(gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("syncfd=true,backend=gfxstream").unwrap();
+            assert!(gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("backend=gfxstream,syncfd=false").unwrap();
+
+            assert!(!gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            let gpu_params: GpuParameters =
+                parse_gpu_options("syncfd=false,backend=gfxstream").unwrap();
+            assert!(!gpu_params.gfxstream_use_syncfd);
+        }
+        {
+            assert!(parse_gpu_options("backend=gfxstream,syncfd=invalid_value").is_err());
+        }
+        {
+            assert!(parse_gpu_options("syncfd=invalid_value,backend=gfxstream").is_err());
+        }
+    }
+
+    #[cfg(all(feature = "gpu", feature = "gfxstream"))]
+    #[test]
+    fn parse_gpu_options_not_gfxstream_with_syncfd_specified() {
+        {
+            assert!(parse_gpu_options("backend=virglrenderer,syncfd=true").is_err());
+        }
+        {
+            assert!(parse_gpu_options("syncfd=true,backend=virglrenderer").is_err());
+        }
+    }
+
+    #[cfg(all(feature = "gpu", feature = "gfxstream"))]
+    #[test]
+    fn parse_gpu_options_gfxstream_with_wsi_specified() {
+        {
+            let gpu_params: GpuParameters = parse_gpu_options("backend=gfxstream,wsi=vk").unwrap();
+            assert!(matches!(gpu_params.wsi, Some(RutabagaWsi::Vulkan)));
+        }
+        {
+            let gpu_params: GpuParameters = parse_gpu_options("wsi=vk,backend=gfxstream").unwrap();
+            assert!(matches!(gpu_params.wsi, Some(RutabagaWsi::Vulkan)));
+        }
+        {
+            assert!(parse_gpu_options("backend=gfxstream,wsi=invalid_value").is_err());
+        }
+        {
+            assert!(parse_gpu_options("wsi=invalid_value,backend=gfxstream").is_err());
+        }
+    }
+
+    #[cfg(feature = "audio")]
+    #[test]
+    fn parse_ac97_valid() {
+        crate::crosvm::config::parse_ac97_options("backend=win_audio")
+            .expect("parse should have succeeded");
+    }
+
+    #[cfg(all(feature = "gpu"))]
+    #[test]
+    fn parse_gpu_options_default_vulkan_support() {
+        #[cfg(unix)]
+        assert!(
+            !parse_gpu_options("backend=virglrenderer")
+                .unwrap()
+                .use_vulkan
+        );
+        #[cfg(feature = "gfxstream")]
+        assert!(!parse_gpu_options("backend=gfxstream").unwrap().use_vulkan);
+        #[cfg(all(feature = "gfxstream", unix))]
+        assert!(parse_gpu_options("backend=gfxstream").unwrap().use_vulkan);
+    }
+
+    #[cfg(all(feature = "gpu"))]
+    #[test]
+    fn parse_gpu_options_with_vulkan_specified() {
+        assert!(parse_gpu_options("vulkan=true").unwrap().use_vulkan);
+        #[cfg(unix)]
+        assert!(
+            parse_gpu_options("backend=virglrenderer,vulkan=true")
.unwrap() + .use_vulkan + ); + #[cfg(unix)] + assert!( + parse_gpu_options("vulkan=true,backend=virglrenderer") + .unwrap() + .use_vulkan + ); + assert!(!parse_gpu_options("vulkan=false").unwrap().use_vulkan); + #[cfg(unix)] + assert!( + !parse_gpu_options("backend=virglrenderer,vulkan=false") + .unwrap() + .use_vulkan + ); + #[cfg(unix)] + assert!( + !parse_gpu_options("vulkan=false,backend=virglrenderer") + .unwrap() + .use_vulkan + ); + #[cfg(unix)] + assert!(parse_gpu_options("backend=virglrenderer,vulkan=invalid_value").is_err()); + assert!(parse_gpu_options("vulkan=invalid_value,backend=virglrenderer").is_err()); + } + + #[cfg(all(feature = "gpu", feature = "gfxstream"))] + #[test] + fn parse_gpu_options_gfxstream_with_gles31_specified() { + assert!( + parse_gpu_options("backend=gfxstream,gles3.1=true") + .unwrap() + .gfxstream_support_gles31 + ); + assert!( + parse_gpu_options("gles3.1=true,backend=gfxstream") + .unwrap() + .gfxstream_support_gles31 + ); + assert!( + !parse_gpu_options("backend=gfxstream,gles3.1=false") + .unwrap() + .gfxstream_support_gles31 + ); + assert!( + !parse_gpu_options("gles3.1=false,backend=gfxstream") + .unwrap() + .gfxstream_support_gles31 + ); + assert!(parse_gpu_options("backend=gfxstream,gles3.1=invalid_value").is_err()); + assert!(parse_gpu_options("gles3.1=invalid_value,backend=gfxstream").is_err()); + } + + #[cfg(all(feature = "gpu", feature = "gfxstream"))] + #[test] + fn parse_gpu_options_not_gfxstream_with_gles31_specified() { + assert!(parse_gpu_options("backend=virglrenderer,gles3.1=true").is_err()); + assert!(parse_gpu_options("gles3.1=true,backend=virglrenderer").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_gpu_display_mode() { + let display_params = parse_gpu_options("display_mode=windowed") + .unwrap() + .display_params; + assert!(matches!( + display_params.display_mode, + GpuDisplayMode::Windowed { .. } + )); + + let display_params = parse_gpu_options("display_mode=borderless_full_screen") + .unwrap() + .display_params; + assert!(matches!( + display_params.display_mode, + GpuDisplayMode::BorderlessFullScreen(_) + )); + + assert!(parse_gpu_options("display_mode=invalid_mode").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_gpu_display_mode_duplicated() { + assert!(parse_gpu_options("display_mode=windowed,display_mode=windowed").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_borderless_full_screen_shouldnt_be_specified_with_size() { + assert!(parse_gpu_options("display_mode=borderless_full_screen,width=1280").is_err()); + assert!(parse_gpu_options("display_mode=borderless_full_screen,height=720").is_err()); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_windowed_with_size() { + const WIDTH: u32 = 1720; + const HEIGHT: u32 = 1800; + const DPI: u32 = 1808; + + let display_params = + parse_gpu_options(format!("display_mode=windowed,width={}", WIDTH).as_str()) + .unwrap() + .display_params; + assert!( + matches!(display_params.display_mode, GpuDisplayMode::Windowed { width, .. } if width == WIDTH) + ); + + let display_params = + parse_gpu_options(format!("display_mode=windowed,height={}", HEIGHT).as_str()) + .unwrap() + .display_params; + assert!( + matches!(display_params.display_mode, GpuDisplayMode::Windowed { height, .. 
} if height == HEIGHT) + ); + + let display_params = + parse_gpu_options(format!("display_mode=windowed,dpi={}", DPI).as_str()) + .unwrap() + .display_params; + assert!( + matches!(display_params.display_mode, GpuDisplayMode::Windowed { dpi, .. } if dpi == DPI) + ); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_hidden() { + let display_params = parse_gpu_options(format!("hidden=true").as_str()) + .unwrap() + .display_params; + assert!(display_params.hidden); + + let display_params = parse_gpu_options(format!("hidden=false").as_str()) + .unwrap() + .display_params; + assert!(matches!(display_params.hidden, false)); + } + + #[cfg(feature = "gpu")] + #[test] + fn parse_gpu_options_size_duplicated() { + assert!(parse_gpu_options("width=1280,width=1280").is_err()); + assert!(parse_gpu_options("height=1280,height=1280").is_err()); + assert!(parse_gpu_options("dpi=1280,dpi=1280").is_err()); + } +} diff --git a/src/crosvm/sys/windows/exit.rs b/src/crosvm/sys/windows/exit.rs new file mode 100644 index 0000000000..c5e82ed35f --- /dev/null +++ b/src/crosvm/sys/windows/exit.rs @@ -0,0 +1,489 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! Enum and Anyhow helpers to set the process exit code. + +use std::fmt::{self, Display, Formatter}; + +use crate::crosvm::sys::config::ProcessType; +use anyhow::Context; + +pub type ExitCode = i32; + +#[derive(Debug)] +pub struct ExitCodeWrapper(pub ExitCode); + +impl Display for ExitCodeWrapper { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "exit code: {} = 0x{:08x}", self.0, self.0) + } +} + +/// Trait for attaching context with process exit codes to a std::result::Result. +pub trait ExitContext { + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into; + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static; + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C; +} + +impl ExitContext for std::result::Result +where + E: std::error::Error + Send + Sync + 'static, +{ + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into, + { + self.context(ExitCodeWrapper(exit_code.into())) + } + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + { + self.context(ExitCodeWrapper(exit_code.into())) + .context(context) + } + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C, + { + self.context(ExitCodeWrapper(exit_code.into())) + .with_context(f) + } +} + +/// Trait for attaching context with process exit codes to an anyhow::Result. 
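+///
+/// A usage sketch (`do_work` is a hypothetical fallible operation; the pattern
+/// mirrors the call sites in this change):
+/// ```ignore
+/// fn do_work() -> anyhow::Result<()> {
+///     Ok(())
+/// }
+/// let res = do_work().exit_context(Exit::SandboxError, "sandbox operation failed");
+/// // `to_exit_code` recovers the exit code if `do_work` had failed.
+/// assert_eq!(res.to_exit_code(), None);
+/// ```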
+pub trait ExitContextAnyhow { + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into; + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static; + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C; + + fn to_exit_code(&self) -> Option; +} + +impl ExitContextAnyhow for anyhow::Result { + fn exit_code(self, exit_code: X) -> anyhow::Result + where + X: Into, + { + self.context(ExitCodeWrapper(exit_code.into())) + } + + fn exit_context(self, exit_code: X, context: C) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + { + self.context(ExitCodeWrapper(exit_code.into())) + .context(context) + } + + fn with_exit_context(self, exit_code: X, f: F) -> anyhow::Result + where + X: Into, + C: Display + Send + Sync + 'static, + F: FnOnce() -> C, + { + self.context(ExitCodeWrapper(exit_code.into())) + .with_context(f) + } + + fn to_exit_code(&self) -> Option { + self.as_ref() + .err() + .map(|e| e.downcast_ref::()) + .flatten() + .map(|w| w.0) + } +} + +#[macro_export] +macro_rules! bail_exit_code { + ($exit_code:literal, $msg:literal $(,)?) => { + return Err(anyhow!($msg)).exit_code($exit_code) + }; + ($exit_code:literal, $err:expr $(,)?) => { + return Err(anyhow!($err)).exit_code($exit_code) + }; + ($exit_code:literal, $fmt:expr, $($arg:tt)*) => { + return Err(anyhow!($fmt, $($arg)*)).exit_code($exit_code) + }; + ($exit_code:expr, $msg:literal $(,)?) => { + return Err(anyhow!($msg)).exit_code($exit_code) + }; + ($exit_code:expr, $err:expr $(,)?) => { + return Err(anyhow!($err)).exit_code($exit_code) + }; + ($exit_code:expr, $fmt:expr, $($arg:tt)*) => { + return Err(anyhow!($fmt, $($arg)*)).exit_code($exit_code) + }; +} + +#[macro_export] +macro_rules! ensure_exit_code { + ($cond:expr, $exit_code:literal $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, concat!("Condition failed: `", stringify!($cond), "`")); + } + }; + ($cond:expr, $exit_code:literal, $msg:literal $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $msg); + } + }; + ($cond:expr, $exit_code:literal, $err:expr $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $err); + } + }; + ($cond:expr, $exit_code:literal, $fmt:expr, $($arg:tt)*) => { + if !$cond { + bail_exit_code!($exit_code, $fmt, $($arg)*); + } + }; + ($cond:expr, $exit_code:expr $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, concat!("Condition failed: `", stringify!($cond), "`")); + } + }; + ($cond:expr, $exit_code:expr, $msg:literal $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $msg); + } + }; + ($cond:expr, $exit_code:expr, $err:expr $(,)?) => { + if !$cond { + bail_exit_code!($exit_code, $err); + } + }; + ($cond:expr, $exit_code:expr, $fmt:expr, $($arg:tt)*) => { + if !$cond { + bail_exit_code!($exit_code, $fmt, $($arg)*); + } + }; +} + +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Exit { + // Windows process exit codes triggered by the kernel tend to be NTSTATUS, so we treat + // our error codes as NTSTATUS to avoid clashing. This means we set the vendor bit. We also + // set the severity to error. As these all set in the MSB, we can write this as a prefix of + // 0xE0. + // + // Because of how these error codes are used in CommandType, we can only use the lower two + // bytes of the u32 for our error codes; in other words, the legal range is + // [0xE0000000, 0xE000FFFF]. 
+ AddGpuDeviceMemory = 0xE0000001, + AddIrqChipVcpu = 0xE0000002, + AddPmemDeviceMemory = 0xE0000003, + AllocateGpuDeviceAddress = 0xE0000004, + AllocatePmemDeviceAddress = 0xE0000005, + BlockDeviceNew = 0xE0000006, + BuildVm = 0xE0000007, + ChownTpmStorage = 0xE0000008, + CloneEvent = 0xE000000A, + CloneVcpu = 0xE000000B, + ConfigureVcpu = 0xE000000C, + CreateAc97 = 0xE000000D, + CreateConsole = 0xE000000E, + CreateDisk = 0xE000000F, + CreateEvent = 0xE0000010, + CreateGralloc = 0xE0000011, + CreateGvm = 0xE0000012, + CreateSocket = 0xE0000013, + CreateTapDevice = 0xE0000014, + CreateTimer = 0xE0000015, + CreateTpmStorage = 0xE0000016, + CreateVcpu = 0xE0000017, + CreateWaitContext = 0xE0000018, + Disk = 0xE0000019, + DiskImageLock = 0xE000001A, + DropCapabilities = 0xE000001B, + EventDeviceSetup = 0xE000001C, + EnableHighResTimer = 0xE000001D, + HandleCreateQcowError = 0xE000001E, + HandleVmRequestError = 0xE0000020, + InitSysLogError = 0xE0000021, + InputDeviceNew = 0xE0000022, + InputEventsOpen = 0xE0000023, + InvalidRunArgs = 0xE0000025, + InvalidSubCommand = 0xE0000026, + InvalidSubCommandArgs = 0xE0000027, + InvalidWaylandPath = 0xE0000028, + LoadKernel = 0xE0000029, + MissingCommandArg = 0xE0000030, + ModifyBatteryError = 0xE0000031, + NetDeviceNew = 0xE0000032, + OpenAcpiTable = 0xE0000033, + OpenAndroidFstab = 0xE0000034, + OpenBios = 0xE0000035, + OpenInitrd = 0xE0000036, + OpenKernel = 0xE0000037, + OpenVinput = 0xE0000038, + PivotRootDoesntExist = 0xE0000039, + PmemDeviceImageTooBig = 0xE000003A, + PmemDeviceNew = 0xE000003B, + ReadMemAvailable = 0xE000003C, + RegisterBalloon = 0xE000003D, + RegisterBlock = 0xE000003E, + RegisterGpu = 0xE000003F, + RegisterNet = 0xE0000040, + RegisterP9 = 0xE0000041, + RegisterRng = 0xE0000042, + RegisterWayland = 0xE0000043, + ReserveGpuMemory = 0xE0000044, + ReserveMemory = 0xE0000045, + ReservePmemMemory = 0xE0000046, + ResetTimer = 0xE0000047, + RngDeviceNew = 0xE0000048, + RunnableVcpu = 0xE0000049, + SettingSignalMask = 0xE000004B, + SpawnVcpu = 0xE000004D, + SysUtil = 0xE000004E, + Timer = 0xE000004F, + ValidateRawDescriptor = 0xE0000050, + VirtioPciDev = 0xE0000051, + WaitContextAdd = 0xE0000052, + WaitContextDelete = 0xE0000053, + WhpxSetupError = 0xE0000054, + VcpuFailEntry = 0xE0000055, + VcpuRunError = 0xE0000056, + VcpuShutdown = 0xE0000057, + VcpuSystemEvent = 0xE0000058, + WaitUntilRunnable = 0xE0000059, + CreateControlServer = 0xE000005A, + CreateTube = 0xE000005B, + UsbError = 0xE000005E, + GuestMemoryLayout = 0xE000005F, + CreateVm = 0xE0000060, + CreateGuestMemory = 0xE0000061, + CreateIrqChip = 0xE0000062, + SpawnIrqThread = 0xE0000063, + ConnectTube = 0xE0000064, + BalloonDeviceNew = 0xE0000065, + BalloonStats = 0xE0000066, + BorrowVfioContainer = 0xE0000067, + OpenCompositeFooterFile = 0xE0000068, + OpenCompositeHeaderFile = 0xE0000069, + OpenCompositeImageFile = 0xE0000070, + CreateCompositeDisk = 0xE0000071, + MissingControlTube = 0xE0000072, + TubeTransporterInit = 0xE0000073, + TubeFailure = 0xE0000074, + ProcessSpawnFailed = 0xE0000075, + LogFile = 0xE0000076, + CreateZeroFiller = 0xE0000077, + GenerateAcpi = 0xE0000078, + WaitContextWait = 0xE0000079, + SetSigintHandler = 0xE000007A, + KilledBySignal = 0xE000007B, + BrokerDeviceExitedTimeout = 0xE000007C, + BrokerMainExitedTimeout = 0xE000007D, + MemoryTooLarge = 0xE000007E, + BrokerMetricsExitedTimeout = 0xE000007F, + MetricsController = 0xE0000080, + SwiotlbTooLarge = 0xE0000081, + UserspaceVsockDeviceNew = 0xE0000082, + VhostUserBlockDeviceNew = 0xE0000083, + 
CrashReportingInit = 0xE0000084, + StartBackendDevice = 0xE0000085, + ConfigureHotPlugDevice = 0xE0000086, + InvalidHotPlugKey = 0xE0000087, + InvalidVfioPath = 0xE0000088, + NoHotPlugBus = 0xE0000089, + SandboxError = 0xE000008A, + Pstore = 0xE000008B, + ProcessInvariantsInit = 0xE000008C, + VirtioVhostUserDeviceNew = 0xE000008D, + CloneTube = 0xE000008E, + VhostUserGpuDeviceNew = 0xE000008F, + CreateAsyncDisk = 0xE0000090, + CreateDiskCheckAsyncOkError = 0xE0000091, + VhostUserNetDeviceNew = 0xE0000092, + BrokerSigtermTimeout = 0xE0000093, + SpawnVcpuMonitor = 0xE0000094, + NoDefaultHypervisor = 0xE0000095, + TscCalibrationFailed = 0xE0000096, + UnknownError = 0xE0000097, + CommonChildSetupError = 0xE0000098, +} + +impl From for ExitCode { + fn from(exit: Exit) -> Self { + exit as ExitCode + } +} + +// Bitfield masks for NTSTATUS & our extension of the format. See to_process_type_error for details. +mod bitmasks { + pub const FACILITY_FIELD_LOWER_MASK: u32 = u32::from_be_bytes([0x00, 0x3F, 0x00, 0x00]); + pub const EXTRA_DATA_FIELD_MASK: u32 = u32::from_be_bytes([0x0F, 0xC0, 0x00, 0x00]); + #[cfg(test)] + pub const EXTRA_DATA_FIELD_COMMAND_TYPE_MASK: u32 = + u32::from_be_bytes([0x07, 0xC0, 0x00, 0x00]); + pub const EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK: u32 = + u32::from_be_bytes([0x08, 0x00, 0x00, 0x00]); + pub const VENDOR_FIELD_MASK: u32 = u32::from_be_bytes([0x20, 0x00, 0x00, 0x00]); + pub const RESERVED_BIT_MASK: u32 = u32::from_be_bytes([0x10, 0x00, 0x00, 0x00]); + pub const COMMAND_TYPE_MASK: u32 = u32::from_be_bytes([0x00, 0x00, 0x00, 0x1F]); +} +use bitmasks::*; + +/// If you are looking for a fun interview question, you have come to the right place. To +/// understand the details of NTSTATUS, which you'll want to do before reading further, visit +/// https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-erref/87fba13e-bf06-450e-83b1-9241dc81e781. +/// +/// This function is unfortunately what happens when you only have six bits to store auxiliary +/// information, and have to fit in with an existing bitfield's schema. +/// +/// This function packs bits in NTSTATUS results (generally what a Windows exit code should be). +/// There are three primary cases it deals with: +/// * Vendor specific exits. These are error codes we generate explicitly in crosvm. We will +/// pack these codes with the lower 6 "facility" bits set so they can't collide with the other +/// cases. The MSB of the facility field will be clear. +/// +/// * Non vendor NTSTATUS exits. These are error codes which come from Windows. We flip the +/// vendor bit on these because we're going to pack the facility field, and leaving it unset +/// would cause us to violate the rule that if the vendor bit is unset, we shouldn't exceed +/// FACILITY_MAXIMUM_VALUE in that field. The MSB of the facility field will be clear. +/// +/// * Non NTSTATUS errors. We detect these with two heuristics: +/// a) Reserved field is set. +/// b) The facility field has exceeded the bottom six bits. +/// +/// For such cases, we pack as much of the error as we can into the lower 6 bits of the +/// facility field, and code field (2 bytes). In this case, the most significant bit of the +/// facility field is set. +/// +/// For all of the cases above, we pack the most significant 5 bits of the facility field with +/// information about what command type generated this error. 
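+///
+/// A worked example, derived from the masks below: packing the vendor-specific
+/// code `Exit::InvalidRunArgs` (0xE0000025) for `ProcessType::Main` (2) sets
+/// the lower facility bits and stores the command type in bits 26..22, giving
+/// `0xE0000025 | 0x003F0000 | (2 << 22) == 0xE0BF0025`.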
+pub fn to_process_type_error(error_code: u32, cmd_type: ProcessType) -> u32 {
+    let is_vendor = error_code & VENDOR_FIELD_MASK != 0;
+
+    // The reserved bit is always clear on an NTSTATUS code.
+    let is_reserved_bit_clear = error_code & RESERVED_BIT_MASK == 0;
+
+    // The six most significant bits of the facility field are where we'll be storing our
+    // command type (and whether we have a valid NTSTATUS error). If bits are already set there,
+    // it means this isn't a valid NTSTATUS code.
+    let is_extra_data_field_clear = error_code & EXTRA_DATA_FIELD_MASK == 0;
+
+    let is_ntstatus = is_reserved_bit_clear && is_extra_data_field_clear;
+
+    // We use the top bit of the facility field to store whether we ran out of space to pack
+    // the error. The next five bits are where we store the command type, so we'll shift them
+    // into the appropriate position here.
+    let command_type = (cmd_type as u32 & COMMAND_TYPE_MASK) << 22;
+
+    match (is_ntstatus, is_vendor) {
+        // Valid vendor code
+        (true, true) => {
+            // Set all the lower facility bits, and attach the command type.
+            error_code | FACILITY_FIELD_LOWER_MASK | command_type
+        }
+
+        // Valid non-vendor code
+        (true, false) => {
+            // Set the vendor bit and attach the command type.
+            error_code | VENDOR_FIELD_MASK | command_type
+        }
+
+        // Not a valid NTSTATUS code.
+        _ => {
+            // Clear the extra data field, and set the top bit of the facility field to
+            // signal that we didn't have enough space for the full error codes.
+            error_code & !EXTRA_DATA_FIELD_MASK | command_type | EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use winapi::shared::ntstatus::STATUS_BAD_INITIAL_PC;
+
+    #[test]
+    fn test_to_process_type_error_ntstatus_vendor() {
+        let e = to_process_type_error(Exit::InvalidRunArgs as u32, ProcessType::Main);
+        assert_eq!(
+            e & EXTRA_DATA_FIELD_COMMAND_TYPE_MASK,
+            (ProcessType::Main as u32) << 22
+        );
+        assert_eq!(e & EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK, 0);
+
+        // This is a valid NTSTATUS error.
+        assert_eq!(e & RESERVED_BIT_MASK, 0);
+
+        // Check the actual crosvm error code contained in the NTSTATUS. We don't mutate the
+        // severity field, so we don't mask it off. We mask off the facility field entirely because
+        // that's where we stored the command type & NTSTATUS validity bit.
+        assert_eq!(e & 0xF000FFFF_u32, Exit::InvalidRunArgs as u32);
+    }
+
+    #[test]
+    fn test_to_process_type_error_ntstatus_non_vendor() {
+        let e = to_process_type_error(STATUS_BAD_INITIAL_PC as u32, ProcessType::Main);
+        assert_eq!(
+            e & EXTRA_DATA_FIELD_COMMAND_TYPE_MASK,
+            (ProcessType::Main as u32) << 22
+        );
+        assert_eq!(e & EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK, 0);
+
+        // This is a valid NTSTATUS error.
+        assert_eq!(e & RESERVED_BIT_MASK, 0);
+
+        // Check the actual error code contained in the NTSTATUS. We mask off all our extra data
+        // fields and switch off the vendor bit to confirm the actual code was left alone.
+        assert_eq!(
+            e & !EXTRA_DATA_FIELD_MASK & !VENDOR_FIELD_MASK,
+            STATUS_BAD_INITIAL_PC as u32
+        );
+    }
+
+    #[test]
+    fn test_to_process_type_error_wontfit_ntstatus() {
+        let e = to_process_type_error(0xFFFFFFFF, ProcessType::Main);
+        assert_eq!(
+            e & EXTRA_DATA_FIELD_COMMAND_TYPE_MASK,
+            (ProcessType::Main as u32) << 22
+        );
+
+        // -1 is not a valid NTSTATUS error.
+        assert_ne!(e & RESERVED_BIT_MASK, 0);
+
+        // Overflow did occur.
+ assert_ne!(e & EXTRA_DATA_FIELD_OVERFLOW_BIT_MASK, 0); + + // Check that we left the rest of the bits (except for our command type field & overflow + // bit) in the exit code untouched. + assert_eq!(e & 0xF03FFFFF_u32, 0xF03FFFFF_u32); + } +} diff --git a/src/crosvm/sys/windows/stats.rs b/src/crosvm/sys/windows/stats.rs new file mode 100644 index 0000000000..259852d172 --- /dev/null +++ b/src/crosvm/sys/windows/stats.rs @@ -0,0 +1,241 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use std::cmp::Reverse; +use std::fmt; +use std::time::{Duration, Instant}; + +use devices::BusStatistics; +use hypervisor::VcpuExit; + +const ERROR_RETRY_I32: i32 = winapi::shared::winerror::ERROR_RETRY as i32; + +/// Statistics about the number and duration of VM exits. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct VmExitStatistics { + /// Whether or not statistics have been enabled to measure VM exits. + enabled: bool, + /// Counter of the number of VM exits per-exit-type. The index into the Vec can be determined + /// from a &Result via the `exit_to_index` function. + exit_counters: Vec, + /// Sum of the duration of VM exits per-exit-type. The index into the Vec can be determined + /// from a &Result via the `exit_to_index` function. + exit_durations: Vec, +} + +impl VmExitStatistics { + pub fn new() -> VmExitStatistics { + VmExitStatistics { + enabled: false, + // We have a known number of exit types, and thus a known number of exit indices + exit_counters: vec![0; MAX_EXIT_INT + 1], + exit_durations: vec![Duration::new(0, 0); MAX_EXIT_INT + 1], + } + } + + /// Enable or disable statistics gathering. + pub fn set_enabled(&mut self, enabled: bool) { + self.enabled = enabled; + } + + /// Get the start time of the stat that is to be recorded. + /// + /// If the VmExitStatistics instance is not enabled this will return None. + pub fn start_stat(&self) -> Option { + if !self.enabled { + return None; + } + Some(Instant::now()) + } + + /// Record the end of the stat. + /// + /// The start value return from start_stat should be passed as `start`. If `start` is None or + /// if the VmExitStatistics instance is not enabled this will do nothing. The counters and + /// durations will silently overflow to prevent interference with vm operation. + pub fn end_stat(&mut self, exit: &base::Result, start: Option) { + if !self.enabled || start.is_none() { + return; + } + + let exit_index = exit_to_index(exit); + + // We overflow because we don't want any disruptions to emulator running due to + // statistics + self.exit_counters[exit_index] = self.exit_counters[exit_index].overflowing_add(1).0; + self.exit_durations[exit_index] = self.exit_durations[exit_index] + .checked_add(start.unwrap().elapsed()) + .unwrap_or(Duration::new(0, 0)); // If we overflow, reset to 0 + } + + /// Merge several VmExitStatistics into one. 
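+    ///
+    /// Sketch (the per-vcpu stats would come from the vcpu threads):
+    /// ```ignore
+    /// let per_vcpu: Vec<VmExitStatistics> = vec![VmExitStatistics::new(); 4];
+    /// let merged = VmExitStatistics::merged(&per_vcpu);
+    /// println!("{}", merged);
+    /// ```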
+ pub fn merged(stats: &[VmExitStatistics]) -> VmExitStatistics { + let mut merged = VmExitStatistics::new(); + for other in stats.iter() { + for exit_index in 0..(MAX_EXIT_INT + 1) { + // We overflow because we don't want any disruptions to emulator running due to + // statistics + merged.exit_counters[exit_index] = merged.exit_counters[exit_index] + .overflowing_add(other.exit_counters[exit_index]) + .0; + merged.exit_durations[exit_index] = merged.exit_durations[exit_index] + .checked_add(other.exit_durations[exit_index]) + .unwrap_or(Duration::new(0, 0)); // If we overflow, reset to 0 + } + } + + merged + } + + /// Get a json representation of `self`. Returns an array of maps, where each map contains the + /// count and duration of a particular vmexit. + pub fn json(&self) -> serde_json::Value { + let mut exits = serde_json::json!([]); + let exits_vec = exits.as_array_mut().unwrap(); + for exit_index in 0..(MAX_EXIT_INT + 1) { + exits_vec.push(serde_json::json!({ + "exit_type": exit_index_to_str(exit_index), + "count": self.exit_counters[exit_index], + "duration": { + "seconds": self.exit_durations[exit_index].as_secs(), + "subsecond_nanos": self.exit_durations[exit_index].subsec_nanos(), + } + })) + } + exits + } +} + +impl std::fmt::Display for VmExitStatistics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "Exit Type Count Duration")?; + + let mut exit_indices: Vec = (0..(MAX_EXIT_INT + 1)).collect(); + // Sort exit indices by exit_duration + exit_indices.sort_by_key(|i| Reverse(self.exit_durations[*i])); + + for exit_index in exit_indices { + writeln!( + f, + "{:<16}{:<16}{:<16}", + exit_index_to_str(exit_index), + self.exit_counters[exit_index], + // Alignment not implemented by Debug + format!("{:?}", self.exit_durations[exit_index]), + )?; + } + + Ok(()) + } +} + +/// This constant should be set to the maximum integer to which the below functions will map a +/// VcpuExit. +const MAX_EXIT_INT: usize = 13; + +/// Map Vm Exits to exit indexes, which are integers for storage in our counter Vecs. +fn exit_to_index(exit: &base::Result) -> usize { + match exit { + Ok(VcpuExit::Io { .. }) => 0, + Ok(VcpuExit::Mmio { .. }) => 1, + Ok(VcpuExit::IoapicEoi { .. }) => 2, + Ok(VcpuExit::IrqWindowOpen) => 3, + Ok(VcpuExit::Hlt) => 4, + Ok(VcpuExit::Shutdown) => 5, + Ok(VcpuExit::FailEntry { .. }) => 6, + Ok(VcpuExit::SystemEventShutdown) => 7, + Ok(VcpuExit::SystemEventReset) => 7, + Ok(VcpuExit::SystemEventCrash) => 7, + Ok(VcpuExit::Intr) => 8, + Ok(VcpuExit::Cpuid { .. }) => 9, + Err(e) if e.errno() == ERROR_RETRY_I32 => 10, + Err(_) => 11, + Ok(VcpuExit::Canceled) => 12, + _ => 13, + } +} + +/// Give human readable names for each exit type that we've mapped to an exit index in exit_to_index. +fn exit_index_to_str(exit: usize) -> String { + (match exit { + 0 => "Io", + 1 => "Mmio", + 2 => "IoapicEoi", + 3 => "IrqWindowOpen", + 4 => "Hlt", + 5 => "Shutdown", + 6 => "FailEntry", + 7 => "SystemEvent", + 8 => "Intr", + 9 => "Cpuid", + 10 => "Retry", + 11 => "Error", + 12 => "Canceled", + _ => "Unknown", + }) + .to_string() +} + +/// Collects, merges, and displays statistics between vcpu threads. 
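+///
+/// Sketch of intended use (populating the per-vcpu vectors happens on the vcpu
+/// threads; the values here are illustrative):
+/// ```ignore
+/// let mut stats = StatisticsCollector::new();
+/// stats.vm_exit_stats.push(VmExitStatistics::new());
+/// println!("{}", stats);        // human-readable tables
+/// println!("{}", stats.json()); // JSON with "vcpus" and "merged" keys
+/// ```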
+#[derive(Default, Clone, Debug)]
+pub struct StatisticsCollector {
+    pub pio_bus_stats: Vec<BusStatistics>,
+    pub mmio_bus_stats: Vec<BusStatistics>,
+    pub vm_exit_stats: Vec<VmExitStatistics>,
+}
+
+impl StatisticsCollector {
+    pub fn new() -> StatisticsCollector {
+        StatisticsCollector::default()
+    }
+
+    /// Return a merged version of the pio bus statistics, mmio bus statistics, and the vm exit
+    /// statistics for all vcpus.
+    fn merged(&self) -> (BusStatistics, BusStatistics, VmExitStatistics) {
+        (
+            BusStatistics::merged(&self.pio_bus_stats),
+            BusStatistics::merged(&self.mmio_bus_stats),
+            VmExitStatistics::merged(&self.vm_exit_stats),
+        )
+    }
+
+    /// Get a json representation of `self`. It contains two top-level keys: "vcpus" and "merged".
+    /// The "vcpus" key's value is a list of per-vcpu stats, and the "merged" stats contain the
+    /// sum of all vcpu stats.
+    pub fn json(&self) -> serde_json::Value {
+        let mut vcpus = serde_json::json!([]);
+        let vcpus_vec = vcpus.as_array_mut().unwrap();
+
+        for i in 0..self.pio_bus_stats.len() {
+            vcpus_vec.push(serde_json::json!({
+                "io": self.pio_bus_stats[i].json(),
+                "mmio": self.mmio_bus_stats[i].json(),
+                "exits": self.vm_exit_stats[i].json(),
+            }));
+        }
+
+        let (pio, mmio, exits) = self.merged();
+
+        serde_json::json!({
+            "merged": {
+                "io": pio.json(),
+                "mmio": mmio.json(),
+                "exits": exits.json(),
+            },
+            "vcpus": vcpus
+        })
+    }
+}
+
+impl std::fmt::Display for StatisticsCollector {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let (pio, mmio, exits) = self.merged();
+        writeln!(f, "Port IO:")?;
+        writeln!(f, "{}", pio)?;
+        writeln!(f, "MMIO:")?;
+        writeln!(f, "{}", mmio)?;
+        writeln!(f, "Vm Exits:")?;
+        writeln!(f, "{}", exits)
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 81a0b48aea..1d810a6d89 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -32,8 +32,11 @@ use vm_control::{
     BalloonControlCommand, DiskControlCommand, UsbControlResult, VmRequest, VmResponse,
 };
 
+use crate::sys::error_to_exit_code;
 use crate::sys::init_log;
 use crosvm::cmdline::{Command, CrossPlatformCommands, CrossPlatformDevicesCommands};
+#[cfg(windows)]
+use sys::windows::metrics;
 
 #[cfg(feature = "scudo")]
 #[global_allocator]
@@ -91,6 +94,12 @@ where
             }
         }
         Ok(cfg) => {
+            #[cfg(feature = "crash-report")]
+            crosvm::sys::setup_emulator_crash_reporting(&cfg)?;
+
+            #[cfg(windows)]
+            metrics::setup_metrics_reporting()?;
+
             init_log(log_config, &cfg)?;
             let exit_state = crate::sys::run_config(cfg);
             to_command_status(exit_state)
@@ -463,9 +472,17 @@ fn prepare_argh_args<I: IntoIterator<Item = String>>(args_iter: I) -> Vec<Strin
 fn crosvm_main() -> Result<CommandStatus> {
+    let _library_watcher = sys::get_library_watcher();
+
+    // The following panic hook will stop our crashpad hook on windows.
+    // Only initialize when the crash-pad feature is off.
     #[cfg(not(feature = "crash-report"))]
     sys::set_panic_hook();
 
+    // Ensure all processes detach from metrics on exit.
+    #[cfg(windows)]
+    let _metrics_destructor = metrics::get_destructor();
+
     let args = prepare_argh_args(std::env::args());
     let args = args.iter().map(|s| s.as_str()).collect::<Vec<_>>();
     let args = match crosvm::cmdline::CrosvmCmdlineArgs::from_args(&args[..1], &args[1..]) {
@@ -493,6 +510,16 @@ fn crosvm_main() -> Result<CommandStatus> {
         // We handle run_vm separately because it does not simply signal success/error
         // but also indicates whether the guest requested reset or stop.
         run_vm(cmd, log_config)
+    } else if let CrossPlatformCommands::Device(cmd) = command {
+        // On windows, the device command handles its own logging setup, so we can't handle it
+        // below; otherwise logging would double init.
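+        // (cfg! expands to a compile-time boolean, so both branches typecheck on every
+        // platform; only the unix build actually initializes syslog here.)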
+        if cfg!(unix) {
+            syslog::init_with(log_config)
+                .map_err(|e| anyhow!("failed to initialize syslog: {}", e))?;
+        }
+        start_device(cmd)
+            .map_err(|_| anyhow!("start_device subcommand failed"))
+            .map(|_| CommandStatus::Success)
     } else {
         syslog::init_with(log_config)
             .map_err(|e| anyhow!("failed to initialize syslog: {}", e))?;
@@ -513,9 +540,7 @@ fn crosvm_main() -> Result<CommandStatus> {
             CrossPlatformCommands::CreateQcow2(cmd) => {
                 create_qcow2(cmd).map_err(|_| anyhow!("create_qcow2 subcommand failed"))
             }
-            CrossPlatformCommands::Device(cmd) => {
-                start_device(cmd).map_err(|_| anyhow!("start_device subcommand failed"))
-            }
+            CrossPlatformCommands::Device(_) => unreachable!(),
             CrossPlatformCommands::Disk(cmd) => {
                 disk_cmd(cmd).map_err(|_| anyhow!("disk subcommand failed"))
             }
@@ -590,7 +615,7 @@ fn main() {
             34
         }
         Err(e) => {
-            let exit_code = 1;
+            let exit_code = error_to_exit_code(&res);
             error!("exiting with error {}:{:?}", exit_code, e);
             exit_code
         }
@@ -611,6 +636,8 @@ mod tests {
         assert!(!is_flag("no-leading-dash"));
     }
 
+    // TODO(b/238361778) this doesn't work on Windows because is_flag isn't called yet.
+    #[cfg(unix)]
     #[test]
     fn args_split_long() {
         assert_eq!(
@@ -621,6 +648,8 @@ mod tests {
         );
     }
 
+    // TODO(b/238361778) this doesn't work on Windows because is_flag isn't called yet.
+    #[cfg(unix)]
    #[test]
    fn args_split_short() {
        assert_eq!(
diff --git a/src/sys.rs b/src/sys.rs
index 7b8f1cf639..0ea17bf646 100644
--- a/src/sys.rs
+++ b/src/sys.rs
@@ -7,12 +7,22 @@ cfg_if::cfg_if! {
         pub(crate) mod unix;
         use unix as platform;
         pub(crate) use crate::crosvm::sys::unix::{run_config, ExitState};
+    } else if #[cfg(windows)] {
+        pub(crate) mod windows;
+        use windows as platform;
+        pub(crate) use windows::ExitState;
+        pub(crate) use windows::run_config;
     } else {
         compile_error!("Unsupported platform");
     }
 }
 
-pub(crate) use platform::main::{cleanup, init_log, run_command, start_device};
+pub(crate) use platform::main::{
+    cleanup, error_to_exit_code, get_library_watcher, init_log, run_command, start_device,
+};
+
+#[cfg(feature = "kiwi")]
+pub(crate) use platform::main::sandbox_lower_token;
 
 #[cfg(not(feature = "crash-report"))]
 pub(crate) use platform::set_panic_hook;
diff --git a/src/sys/unix/main.rs b/src/sys/unix/main.rs
index 383bed5079..99815cf427 100644
--- a/src/sys/unix/main.rs
+++ b/src/sys/unix/main.rs
@@ -16,7 +16,7 @@ use devices::virtio::vhost::user::device::{
 
 use crate::{
     crosvm::sys::cmdline::{Commands, DevicesSubcommand},
-    Config,
+    CommandStatus, Config,
 };
 
 pub(crate) fn start_device(command: DevicesSubcommand) -> anyhow::Result<()> {
@@ -74,6 +74,10 @@ pub(crate) fn cleanup() {
     }
 }
 
+pub fn get_library_watcher() -> std::io::Result<()> {
+    Ok(())
+}
+
 pub(crate) fn run_command(_cmd: Commands) -> anyhow::Result<()> {
     Err(anyhow::anyhow!("invalid command"))
 }
@@ -88,3 +92,7 @@ where
     }
     Ok(())
 }
+
+pub(crate) fn error_to_exit_code(_res: &std::result::Result<CommandStatus, anyhow::Error>) -> i32 {
+    1
+}
diff --git a/src/sys/windows.rs b/src/sys/windows.rs
new file mode 100644
index 0000000000..5b3f8431e3
--- /dev/null
+++ b/src/sys/windows.rs
@@ -0,0 +1,2078 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+pub(crate) mod irq_wait;
+pub(crate) mod main;
+pub(crate) mod metrics;
+#[cfg(not(feature = "crash-report"))]
+mod panic_hook;
+pub(crate) mod run_vcpu;
+
+use irq_wait::IrqWaitWorker;
+#[cfg(not(feature = "crash-report"))]
+pub(crate) use panic_hook::set_panic_hook;
+use run_vcpu::{run_all_vcpus, VcpuRunMode};
+
+use crate::crosvm::config::{Config, Executable};
+use crate::crosvm::sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow};
+use crate::crosvm::sys::windows::stats::StatisticsCollector;
+
+use crate::sys::windows::metrics::{log_descriptor, MetricEventType};
+use acpi_tables::sdt::SDT;
+#[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+use anti_tamper::spawn_dedicated_anti_tamper_thread;
+#[cfg(feature = "kiwi")]
+use anyhow::ensure;
+use anyhow::{anyhow, bail, Context, Result};
+use arch::{self, LinuxArch, RunnableLinuxVm, VirtioDeviceStub, VmComponents, VmImage};
+#[cfg(feature = "kiwi")]
+use base::give_foregrounding_permission;
+use base::{
+    self, enable_high_res_timers, error, info, warn, Event, EventToken, ExternalMapping,
+    FromRawDescriptor, RawDescriptor, ReadNotifier, RecvTube, SendTube, Tube, TubeError,
+    VmEventType, WaitContext,
+};
+use devices::serial_device::{SerialHardware, SerialParameters};
+use devices::virtio::block::block::DiskOption;
+use devices::virtio::{self, BalloonMode, Console, PvClock};
+use devices::Minijail;
+use devices::{
+    self, get_tsc_sync_mitigations, standard_deviation, Ac97Dev, BusDeviceObj, TscSyncMitigations,
+    UserspaceIrqChip, VirtioPciDevice,
+};
+#[cfg(feature = "haxm")]
+use hypervisor::haxm::{get_use_ghaxm, set_use_ghaxm, Haxm, HaxmVcpu, HaxmVm};
+use hypervisor::{ProtectionType, Vm};
+use resources::SystemAllocator;
+use rutabaga_gfx::RutabagaGralloc;
+#[cfg(feature = "kiwi")]
+use std::convert::TryInto;
+use std::fs::{File, OpenOptions};
+use std::iter;
+use std::mem;
+use std::os::windows::fs::OpenOptionsExt;
+use std::sync::Arc;
+use sync::Mutex;
+use tracing;
+#[cfg(feature = "kiwi")]
+use vm_control::{
+    Ac97Control, BalloonControlCommand,
+    GpuSendToMain::{self, MuteAc97, SendToService},
+    PvClockCommand, PvClockCommandResponse, ServiceSendToGpu,
+};
+#[cfg(feature = "gvm")]
+use {
+    devices::GvmIrqChip,
+    hypervisor::gvm::{Gvm, GvmVcpu, GvmVersion, GvmVm},
+};
+
+use vm_control::{VmMemoryRequest, VmRunMode};
+use vm_memory::GuestMemory;
+use winapi::um::winnt::FILE_SHARE_READ;
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "haxm"))]
+use x86_64::{get_cpu_manufacturer, CpuManufacturer};
+
+#[cfg(feature = "gpu")]
+use {
+    crate::crosvm::config::TouchDeviceOption,
+    base::{BlockingMode, FramingMode, StreamChannel},
+    gpu_display::EventDevice,
+    std::collections::BTreeMap,
+    std::num::NonZeroU8,
+};
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+use {
+    aarch64::AArch64 as Arch,
+    devices::{IrqChip, IrqChipAArch64 as IrqChipArch},
+    hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch},
+};
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use {
+    devices::IrqChipX86_64 as IrqChipArch,
+    hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch},
+    x86_64::{adjust_cpuid, CpuIdContext, X8664arch as Arch},
+};
+
+#[cfg(feature = "whpx")]
+use {
+    devices::WhpxSplitIrqChip,
+    hypervisor::whpx::{Whpx, WhpxFeature, WhpxVcpu, WhpxVm},
+    hypervisor::Hypervisor,
+    hypervisor::HypervisorCap,
+    hypervisor::HypervisorX86_64,
+    std::arch::x86_64::{__cpuid, __cpuid_count},
+};
+
+use crate::crosvm::sys::config::{HypervisorKind, IrqChipKind};
+use broker_ipc::{common_child_setup, CommonChildStartupArgs};
+#[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+use service_ipc::request_utilities::prod::MessageToService;
+#[cfg(feature = "kiwi")]
+use service_ipc::{
+    get_balloon_size, request_utilities::prod::MessageFromService,
+    service_vm_state::ServiceVmState, ServiceIpc,
+};
+use tube_transporter::{TubeToken, TubeTransporterReader};
+
+const DEFAULT_GUEST_CID: u64 = 3;
+
+enum TaggedControlTube {
+    // TODO: handle vm_control messages as they get added.
+    #[allow(dead_code)]
+    Vm(Tube),
+    VmMemory(Tube),
+    #[cfg(feature = "kiwi")]
+    GpuServiceComm(Tube),
+    #[cfg(feature = "kiwi")]
+    GpuDeviceServiceComm(Tube),
+}
+
+pub enum ExitState {
+    Reset,
+    Stop,
+    Crash,
+    #[allow(dead_code)]
+    GuestPanic,
+}
+
+type DeviceResult<T = VirtioDeviceStub> = Result<T>;
+
+fn create_vhost_user_block_device(cfg: &Config, disk_device_tube: Tube) -> DeviceResult {
+    let features = virtio::base_features(cfg.protected_vm);
+    let dev = virtio::vhost::user::vmm::Block::new(features, disk_device_tube).exit_context(
+        Exit::VhostUserBlockDeviceNew,
+        "failed to set up vhost-user block device",
+    )?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+fn create_block_device(cfg: &Config, disk: &DiskOption, disk_device_tube: Tube) -> DeviceResult {
+    // Lock the disk image to prevent other crosvm instances from using it, unless it is
+    // read_only.
+    let share_flags = if disk.read_only { FILE_SHARE_READ } else { 0 };
+    let raw_image: File = OpenOptions::new()
+        .read(true)
+        .write(!disk.read_only)
+        .share_mode(share_flags)
+        .open(&disk.path)
+        .with_exit_context(Exit::Disk, || {
+            format!("failed to load disk image {}", disk.path.display())
+        })?;
+
+    let disk_file =
+        disk::create_disk_file(raw_image, disk.sparse, disk::MAX_NESTING_DEPTH, &disk.path)
+            .exit_context(Exit::CreateAsyncDisk, "failed to create virtual disk")?;
+    let features = virtio::base_features(cfg.protected_vm);
+    let dev = virtio::Block::new(
+        features,
+        disk_file,
+        disk.read_only,
+        disk.sparse,
+        disk.block_size,
+        disk.id,
+        Some(disk_device_tube),
+    )
+    .exit_context(Exit::BlockDeviceNew, "failed to create block device")?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+#[cfg(feature = "gpu")]
+fn create_gpu_device(
+    cfg: &Config,
+    vm_evt_wrtube: &SendTube,
+    gpu_device_tube: Tube,
+    resource_bridges: Vec<Tube>,
+    event_devices: Vec<EventDevice>,
+    map_request: Arc<Mutex<Option<ExternalMapping>>>,
+    #[cfg(feature = "kiwi")] gpu_device_service_tube: Tube,
+) -> DeviceResult {
+    let gpu_parameters = cfg
+        .gpu_parameters
+        .as_ref()
+        .expect("No GPU parameters provided in config!");
+    let display_backends = vec![virtio::DisplayBackend::WinAPI(
+        (&gpu_parameters.display_params).into(),
+    )];
+
+    let features = virtio::base_features(cfg.protected_vm);
+    let dev = virtio::Gpu::new(
+        vm_evt_wrtube
+            .try_clone()
+            .exit_context(Exit::CloneTube, "failed to clone tube")?,
+        Some(gpu_device_tube),
+        NonZeroU8::new(1).unwrap(), // number of scanouts
+        resource_bridges,
+        display_backends,
+        gpu_parameters,
+        event_devices,
+        map_request,
+        /* external_blob= */ false,
+        features,
+        BTreeMap::new(),
+        #[cfg(feature = "kiwi")]
+        Some(gpu_device_service_tube),
+    );
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+#[cfg(feature = "gpu")]
+fn create_multi_touch_device(
+    cfg: &Config,
+    multi_touch_spec: &TouchDeviceOption,
+    event_pipe: StreamChannel,
+    idx: u32,
+) -> DeviceResult {
+    let (width, height) = multi_touch_spec.get_size();
+    let dev = virtio::new_multi_touch(
+        idx,
+        event_pipe,
+        width,
+        height,
+        virtio::base_features(cfg.protected_vm),
+    )
+    .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+#[cfg(feature = "gpu")]
+fn create_mouse_device(cfg: &Config, event_pipe: StreamChannel, idx: u32) -> DeviceResult {
+    let dev = virtio::new_mouse(idx, event_pipe, virtio::base_features(cfg.protected_vm))
+        .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+#[cfg(feature = "slirp")]
+fn create_net_device(
+    #[cfg(feature = "slirp-ring-capture")] slirp_capture_file: &Option<String>,
+) -> DeviceResult {
+    let dev = virtio::Net::<net_util::Slirp>::new_slirp(
+        #[cfg(feature = "slirp-ring-capture")]
+        slirp_capture_file,
+    )
+    .exit_context(Exit::NetDeviceNew, "failed to set up virtio networking")?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+#[cfg(feature = "slirp")]
+fn create_vhost_user_net_device(cfg: &Config, net_device_tube: Tube) -> DeviceResult {
+    let features = virtio::base_features(cfg.protected_vm);
+    let dev = virtio::vhost::user::vmm::Net::new(features, net_device_tube).exit_context(
+        Exit::VhostUserNetDeviceNew,
+        "failed to set up vhost-user net device",
+    )?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+fn create_rng_device(cfg: &Config) -> DeviceResult {
+    let dev = virtio::Rng::new(virtio::base_features(cfg.protected_vm))
+        .exit_context(Exit::RngDeviceNew, "failed to set up rng")?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+fn create_console_device(cfg: &Config, param: &SerialParameters) -> DeviceResult {
+    let mut keep_rds = Vec::new();
+    let evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
+    let dev = param
+        .create_serial_device::<Console>(cfg.protected_vm, &evt, &mut keep_rds)
+        .exit_context(Exit::CreateConsole, "failed to create console device")?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+#[allow(dead_code)] // TODO(b/234031017): balloon device startup gets stuck on Windows
+fn create_balloon_device(
+    cfg: &Config,
+    balloon_device_tube: Tube,
+    dynamic_mapping_device_tube: Tube,
+    inflate_tube: Option<Tube>,
+    init_balloon_size: u64,
+) -> DeviceResult {
+    let dev = virtio::Balloon::new(
+        virtio::base_features(cfg.protected_vm),
+        balloon_device_tube,
+        dynamic_mapping_device_tube,
+        inflate_tube,
+        init_balloon_size,
+        if cfg.strict_balloon {
+            BalloonMode::Strict
+        } else {
+            BalloonMode::Relaxed
+        },
+    )
+    .exit_context(Exit::BalloonDeviceNew, "failed to create balloon")?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+fn create_vsock_device(cfg: &Config) -> DeviceResult {
+    // We only support a single guest, so we can confidently assign a default
+    // CID if one isn't provided. We choose the lowest non-reserved value.
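+    // (The vsock spec reserves CIDs 0-2; CID 2 is the host, so 3 is the first
+    // CID a guest can use.)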
+    let dev = virtio::Vsock::new(
+        cfg.cid.unwrap_or(DEFAULT_GUEST_CID),
+        cfg.host_guid.clone(),
+        virtio::base_features(cfg.protected_vm),
+    )
+    .exit_context(
+        Exit::UserspaceVsockDeviceNew,
+        "failed to create userspace vsock device",
+    )?;
+
+    Ok(VirtioDeviceStub {
+        dev: Box::new(dev),
+        jail: None,
+    })
+}
+
+#[cfg_attr(not(feature = "gpu"), allow(unused_variables))]
+fn create_virtio_devices(
+    cfg: &mut Config,
+    vm_evt_wrtube: &SendTube,
+    gpu_device_tube: Tube,
+    disk_device_tubes: &mut Vec<Tube>,
+    _balloon_device_tube: Option<Tube>,
+    pvclock_device_tube: Option<Tube>,
+    _dynamic_mapping_device_tube: Option<Tube>,
+    _inflate_tube: Option<Tube>,
+    _init_balloon_size: u64,
+    map_request: Arc<Mutex<Option<ExternalMapping>>>,
+    #[cfg(feature = "kiwi")] gpu_device_service_tube: Tube,
+    tsc_frequency: u64,
+) -> DeviceResult<Vec<VirtioDeviceStub>> {
+    let mut devs = Vec::new();
+
+    if cfg.block_vhost_user_tube.is_empty() {
+        // Disk devices must precede virtio-console devices or the kernel does not boot.
+        // TODO(b/171215421): figure out why this ordering is required and fix it.
+        for disk in &cfg.disks {
+            let disk_device_tube = disk_device_tubes.remove(0);
+            devs.push(create_block_device(cfg, disk, disk_device_tube)?);
+        }
+    } else {
+        info!("Starting up vhost user block backends...");
+        for _disk in &cfg.disks {
+            let disk_device_tube = cfg.block_vhost_user_tube.remove(0);
+            devs.push(create_vhost_user_block_device(cfg, disk_device_tube)?);
+        }
+    }
+
+    for (_, param) in cfg
+        .serial_parameters
+        .iter()
+        .filter(|(_k, v)| v.hardware == SerialHardware::VirtioConsole)
+    {
+        let dev = create_console_device(cfg, param)?;
+        devs.push(dev);
+    }
+
+    if let Some(tube) = pvclock_device_tube {
+        devs.push(VirtioDeviceStub {
+            dev: Box::new(PvClock::new(tsc_frequency, tube)),
+            jail: None,
+        });
+    }
+
+    devs.push(create_rng_device(cfg)?);
+
+    #[cfg(feature = "slirp")]
+    if let Some(net_vhost_user_tube) = cfg.net_vhost_user_tube.take() {
+        devs.push(create_vhost_user_net_device(cfg, net_vhost_user_tube)?);
+    } else {
+        devs.push(create_net_device(
+            #[cfg(feature = "slirp-ring-capture")]
+            &cfg.slirp_capture_file,
+        )?);
+    }
+
+    // TODO(b/234031017): balloon device startup gets stuck on Windows
+    //if let (Some(balloon_device_tube), Some(dynamic_mapping_device_tube)) =
+    //    (balloon_device_tube, dynamic_mapping_device_tube)
+    //{
+    //    devs.push(create_balloon_device(
+    //        &cfg,
+    //        balloon_device_tube,
+    //        dynamic_mapping_device_tube,
+    //        inflate_tube,
+    //        init_balloon_size,
+    //    )?);
+    //}
+
+    devs.push(create_vsock_device(&cfg)?);
+
+    #[cfg(feature = "gpu")]
+    {
+        let resource_bridges = Vec::<Tube>::new();
+        let mut event_devices: Vec<EventDevice> = Vec::new();
+
+        if !cfg.virtio_single_touch.is_empty() {
+            unimplemented!("--single-touch is no longer supported. Use --multi-touch instead.");
+        }
+
+        for (idx, multi_touch_spec) in cfg.virtio_multi_touch.iter().enumerate() {
+            let (event_device_pipe, virtio_input_pipe) =
+                StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
+                    .exit_context(Exit::EventDeviceSetup, "failed to set up EventDevice")?;
+
+            devs.push(create_multi_touch_device(
+                cfg,
+                multi_touch_spec,
+                virtio_input_pipe,
+                idx as u32,
+            )?);
+            event_devices.push(EventDevice::touchscreen(event_device_pipe));
+        }
+
+        for (idx, _mouse_socket) in cfg.virtio_mice.iter().enumerate() {
+            let (event_device_pipe, virtio_input_pipe) =
+                StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
+                    .exit_context(Exit::EventDeviceSetup, "failed to set up EventDevice")?;
+            devs.push(create_mouse_device(cfg, virtio_input_pipe, idx as u32)?);
+            event_devices.push(EventDevice::mouse(event_device_pipe));
+        }
+
+        let (event_device_pipe, virtio_input_pipe) =
+            StreamChannel::pair(BlockingMode::Nonblocking, FramingMode::Byte)
+                .exit_context(Exit::EventDeviceSetup, "failed to set up EventDevice")?;
+
+        let dev = virtio::new_keyboard(
+            /* idx= */ 0,
+            virtio_input_pipe,
+            virtio::base_features(cfg.protected_vm),
+        )
+        .exit_context(Exit::InputDeviceNew, "failed to set up input device")?;
+        devs.push(VirtioDeviceStub {
+            dev: Box::new(dev),
+            jail: None,
+        });
+        event_devices.push(EventDevice::keyboard(event_device_pipe));
+
+        devs.push(create_gpu_device(
+            cfg,
+            vm_evt_wrtube,
+            gpu_device_tube,
+            resource_bridges,
+            event_devices,
+            map_request,
+            #[cfg(feature = "kiwi")]
+            gpu_device_service_tube,
+        )?);
+    }
+
+    Ok(devs)
+}
+
+fn create_devices(
+    cfg: &mut Config,
+    mem: &GuestMemory,
+    exit_evt_wrtube: &SendTube,
+    irq_control_tubes: &mut Vec<Tube>,
+    gpu_device_tube: Tube,
+    disk_device_tubes: &mut Vec<Tube>,
+    balloon_device_tube: Option<Tube>,
+    pvclock_device_tube: Option<Tube>,
+    dynamic_mapping_device_tube: Option<Tube>,
+    inflate_tube: Option<Tube>,
+    init_balloon_size: u64,
+    map_request: Arc<Mutex<Option<ExternalMapping>>>,
+    ac97_device_tubes: Vec<Tube>,
+    #[cfg(feature = "kiwi")] gpu_device_service_tube: Tube,
+    tsc_frequency: u64,
+) -> DeviceResult<Vec<(Box<dyn BusDeviceObj>, Option<Minijail>)>> {
+    let stubs = create_virtio_devices(
+        cfg,
+        exit_evt_wrtube,
+        gpu_device_tube,
+        disk_device_tubes,
+        balloon_device_tube,
+        pvclock_device_tube,
+        dynamic_mapping_device_tube,
+        inflate_tube,
+        init_balloon_size,
+        map_request,
+        #[cfg(feature = "kiwi")]
+        gpu_device_service_tube,
+        tsc_frequency,
+    )?;
+
+    let mut pci_devices = Vec::new();
+
+    for stub in stubs {
+        let (msi_host_tube, msi_device_tube) =
+            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
+        irq_control_tubes.push(msi_host_tube);
+
+        let dev = Box::new(
+            VirtioPciDevice::new(
+                mem.clone(),
+                stub.dev,
+                msi_device_tube,
+                cfg.disable_virtio_intx,
+                None,
+            )
+            .exit_context(Exit::VirtioPciDev, "failed to create virtio pci dev")?,
+        ) as Box<dyn BusDeviceObj>;
+        pci_devices.push((dev, stub.jail));
+    }
+
+    if cfg.ac97_parameters.len() != ac97_device_tubes.len() {
+        panic!(
+            "{} Ac97 device(s) will be made, but only {} Ac97 device tubes are present.",
+            cfg.ac97_parameters.len(),
+            ac97_device_tubes.len()
+        );
+    }
+
+    for (ac97_param, ac97_device_tube) in cfg
+        .ac97_parameters
+        .iter()
+        .zip(ac97_device_tubes.into_iter())
+    {
+        let dev = Ac97Dev::try_new(mem.clone(), ac97_param.clone(), ac97_device_tube)
+            .exit_context(Exit::CreateAc97, "failed to create ac97 device")?;
+        pci_devices.push((Box::new(dev), None));
+    }
+
+    Ok(pci_devices)
+}
+
+#[cfg(feature = "kiwi")]
+fn set_package_name(msg: &MessageFromService) {
+    match msg {
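+        // Crash and metrics reports are tagged with the foregrounded package; clear the
+        // name while the window is hidden and restore it when it is shown again.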
+        MessageFromService::HideWindow => {
+            #[cfg(feature = "crash-report")]
+            crash_report::set_package_name("");
+
+            metrics::set_package_name("");
+        }
+        MessageFromService::ShowWindow(ref show) => {
+            #[cfg(feature = "crash-report")]
+            crash_report::set_package_name(&show.package_name);
+
+            metrics::set_package_name(&show.package_name);
+        }
+        _ => {}
+    }
+}
+
+#[cfg(feature = "kiwi")]
+fn merge_session_invariants(serialized_session_invariants: &[u8]) {
+    metrics::merge_session_invariants(serialized_session_invariants);
+}
+
+#[derive(Debug)]
+struct PvClockError(String);
+
+/// Sending a pvclock command to the pvclock device can be tricky because we need to wait for a
+/// response from the pvclock device if it's running. But, it's possible that the device is not
+/// set up yet (or never will be, because the guest doesn't support it). In that case, we want to
+/// time out on recv-ing a response, and to do that we need to do a wait_timeout on the Tube's
+/// read_notifier.
+#[cfg(feature = "kiwi")]
+fn handle_pvclock_request(tube: &Option<Tube>, command: PvClockCommand) -> Result<()> {
+    if let Some(ref tube) = tube {
+        tube.send(&command)
+            .with_context(|| format!("failed to send pvclock command {:?}", command))?;
+
+        #[derive(EventToken)]
+        enum Token {
+            RecvReady,
+        }
+
+        let wait_ctx = WaitContext::build_with(&[(tube.get_read_notifier(), Token::RecvReady)])
+            .context("failed to build pvclock wait context")?;
+
+        let evts = wait_ctx
+            .wait_timeout(std::time::Duration::from_millis(100))
+            .context("failed to wait on pvclock wait context")?;
+
+        ensure!(evts.len() > 0, "timed out waiting for pvclock response");
+
+        let resp = tube
+            .recv::<PvClockCommandResponse>()
+            .context("failed to receive pvclock command response")?;
+
+        if let PvClockCommandResponse::Err(e) = resp {
+            bail!("pvclock encountered error on {:?}: {}", command, e);
+        }
+    }
+
+    Ok(())
+}
+
+fn run_control<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
+    mut guest_os: RunnableLinuxVm<V, Vcpu>,
+    sys_allocator: SystemAllocator,
+    mut control_tubes: Vec<TaggedControlTube>,
+    irq_control_tubes: Vec<Tube>,
+    vm_evt_rdtube: RecvTube,
+    vm_evt_wrtube: SendTube,
+    broker_shutdown_evt: Option<Event>,
+    balloon_host_tube: Option<Tube>,
+    pvclock_host_tube: Option<Tube>,
+    map_request: Arc<Mutex<Option<ExternalMapping>>>,
+    mut gralloc: RutabagaGralloc,
+    stats: Option<Arc<Mutex<StatisticsCollector>>>,
+    #[cfg(feature = "kiwi")] service_pipe_name: Option<String>,
+    ac97_host_tubes: Vec<Tube>,
+    memory_size_mb: u64,
+    host_cpu_topology: bool,
+    tsc_sync_mitigations: TscSyncMitigations,
+    force_calibrated_tsc_leaf: bool,
+) -> Result<ExitState> {
+    #[cfg(not(feature = "kiwi"))]
+    {
+        // These variables are not used in other configurations. Suppress warnings.
+        let _ = balloon_host_tube;
+        let _ = pvclock_host_tube;
+        let _ = ac97_host_tubes;
+        let _ = memory_size_mb;
+    }
+
+    #[derive(EventToken)]
+    enum Token {
+        VmEvent,
+        BrokerShutdown,
+        VmControl {
+            index: usize,
+        },
+        #[cfg(feature = "kiwi")]
+        ServiceIpc,
+        #[cfg(feature = "proto-tube-hack")]
+        ProtoIpc,
+        #[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+        AntiTamper,
+    }
+
+    #[cfg(feature = "kiwi")]
+    // Note: We use anti_tamper::MAX_CHALLENGE_SIZE because it's the
+    // largest message passed through the tube. Note the Tube buffer has
+    // to accommodate the largest message because of b/223807352.
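+    // The same constant sizes both ends of the pair; a message larger than the buffer
+    // would hit the deadlock described in b/223807352.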
+    let (ipc_main_loop_tube, ipc_service_ipc_tube) =
+        Tube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE)
+            .expect("Could not create Tube::pair()!");
+
+    #[cfg(feature = "proto-tube-hack")]
+    let (proto_main_loop_tube, proto_service_ipc_tube) =
+        base::ProtoTube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE)
+            .expect("Could not create Tube::pair()!");
+
+    #[cfg(feature = "kiwi")]
+    let _service_ipc = ServiceIpc::start_ipc_listening_loops(
+        service_pipe_name,
+        ipc_service_ipc_tube,
+        #[cfg(feature = "proto-tube-hack")]
+        proto_service_ipc_tube,
+    );
+
+    #[cfg(feature = "kiwi")]
+    let mut service_vm_state = ServiceVmState::new();
+
+    let sys_allocator_mutex = Arc::new(Mutex::new(sys_allocator));
+
+    let exit_evt = Event::new().exit_context(Exit::CreateEvent, "failed to create event")?;
+
+    // Create a separate thread to wait on IRQ events. This is a natural division
+    // because IRQ interrupts have no dependencies on other events, and this lets
+    // us avoid approaching the Windows WaitForMultipleObjects 64-object limit.
+    let irq_join_handle = IrqWaitWorker::start(
+        exit_evt
+            .try_clone()
+            .exit_context(Exit::CloneEvent, "failed to clone event")?,
+        guest_os
+            .irq_chip
+            .try_box_clone()
+            .exit_context(Exit::CloneEvent, "failed to clone irq chip")?,
+        irq_control_tubes,
+        sys_allocator_mutex.clone(),
+    );
+
+    let wait_ctx = WaitContext::build_with(&[
+        (vm_evt_rdtube.get_read_notifier(), Token::VmEvent),
+        #[cfg(feature = "kiwi")]
+        (ipc_main_loop_tube.get_read_notifier(), Token::ServiceIpc),
+        #[cfg(feature = "proto-tube-hack")]
+        (proto_main_loop_tube.get_read_notifier(), Token::ProtoIpc),
+    ])
+    .exit_context(
+        Exit::WaitContextAdd,
+        "failed to add trigger to wait context",
+    )?;
+    if let Some(evt) = broker_shutdown_evt.as_ref() {
+        wait_ctx.add(evt, Token::BrokerShutdown).exit_context(
+            Exit::WaitContextAdd,
+            "failed to add trigger to wait context",
+        )?;
+    }
+
+    for (index, control_tube) in control_tubes.iter().enumerate() {
+        #[allow(clippy::single_match)]
+        match control_tube {
+            TaggedControlTube::VmMemory(tube) => {
+                wait_ctx
+                    .add(tube.get_read_notifier(), Token::VmControl { index })
+                    .exit_context(
+                        Exit::WaitContextAdd,
+                        "failed to add trigger to wait context",
+                    )?;
+            }
+            #[cfg(feature = "kiwi")]
+            TaggedControlTube::GpuServiceComm(tube) => {
+                wait_ctx
+                    .add(tube.get_read_notifier(), Token::VmControl { index })
+                    .exit_context(
+                        Exit::WaitContextAdd,
+                        "failed to add trigger to wait context",
+                    )?;
+            }
+            #[cfg(feature = "kiwi")]
+            TaggedControlTube::GpuDeviceServiceComm(tube) => {
+                wait_ctx
+                    .add(tube.get_read_notifier(), Token::VmControl { index })
+                    .exit_context(
+                        Exit::WaitContextAdd,
+                        "failed to add trigger to wait context",
+                    )?;
+            }
+            // TODO(nkgold): as new control tubes are added, we'll need to add support for them
+            _ => (),
+        }
+    }
+
+    let vcpus: Vec<Option<Vcpu>> = match guest_os.vcpus.take() {
+        Some(vec) => vec.into_iter().map(|vcpu| Some(vcpu)).collect(),
+        None => iter::repeat_with(|| None)
+            .take(guest_os.vcpu_count)
+            .collect(),
+    };
+
+    #[cfg(all(
+        feature = "kiwi",
+        feature = "anti-tamper",
+        not(feature = "proto-tube-hack")
+    ))]
+    let (anti_tamper_main_thread_tube, anti_tamper_dedicated_thread_tube) =
+        Tube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE)
+            .expect("Could not create Tube::pair()!");
+
+    #[cfg(all(feature = "kiwi", feature = "anti-tamper", feature = "proto-tube-hack"))]
+    let (anti_tamper_main_thread_tube, anti_tamper_dedicated_thread_tube) =
+        base::ProtoTube::pair_with_buffer_size(anti_tamper::MAX_CHALLENGE_SIZE)
+            .expect("Could not create Tube::pair()!");
+
+    #[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+    if let Err(_e) = wait_ctx.add(
+        anti_tamper_main_thread_tube.get_read_notifier(),
+        Token::AntiTamper,
+    ) {
+        #[cfg(debug_assertions)]
+        error!("Failed to add anti-tamper tube to wait_ctx: {}", _e);
+    }
+
+    #[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+    spawn_dedicated_anti_tamper_thread(anti_tamper_dedicated_thread_tube);
+
+    if sandbox::is_sandbox_target() {
+        sandbox::TargetServices::get()
+            .exit_context(Exit::SandboxError, "failed to create sandbox")?
+            .expect("Could not create sandbox!")
+            .lower_token();
+    }
+
+    let vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>> = Arc::new(Mutex::new(Vec::new()));
+    let run_mode_arc = Arc::new(VcpuRunMode::default());
+    let vcpu_threads = run_all_vcpus(
+        vcpus,
+        vcpu_boxes.clone(),
+        &guest_os,
+        &exit_evt,
+        &vm_evt_wrtube,
+        &pvclock_host_tube,
+        &stats,
+        host_cpu_topology,
+        run_mode_arc.clone(),
+        tsc_sync_mitigations,
+        force_calibrated_tsc_leaf,
+    )?;
+    let mut exit_state = ExitState::Stop;
+
+    // TODO: udam b/142733266 (sandboxing) registerwaitforsingleobject to wait on
+    // child processes when they exit
+    'poll: loop {
+        let events = {
+            match wait_ctx.wait() {
+                Ok(v) => v,
+                Err(e) => {
+                    error!("failed to wait: {}", e);
+                    break;
+                }
+            }
+        };
+
+        let mut vm_control_indices_to_remove = Vec::new();
+        for event in events.iter().filter(|e| e.is_readable) {
+            match event.token {
+                Token::VmEvent => match vm_evt_rdtube.recv::<VmEventType>() {
+                    Ok(vm_event) => {
+                        match vm_event {
+                            VmEventType::Exit => {
+                                info!("vcpu requested shutdown");
+                                exit_state = ExitState::Stop;
+                            }
+                            VmEventType::Reset => {
+                                info!("vcpu requested reset");
+                                exit_state = ExitState::Reset;
+                            }
+                            VmEventType::Crash => {
+                                info!("vcpu crashed");
+                                exit_state = ExitState::Crash;
+                            }
+                            VmEventType::Panic(_) => {
+                                error!("got pvpanic event; this event is not expected on Windows.");
+                            }
+                        }
+                        break 'poll;
+                    }
+                    Err(e) => {
+                        warn!("failed to recv VmEvent: {}", e);
+                    }
+                },
+                Token::BrokerShutdown => {
+                    info!("main loop got broker shutdown event");
+                    break 'poll;
+                }
+                Token::VmControl { index } => {
+                    if let Some(tube) = control_tubes.get(index) {
+                        #[allow(clippy::single_match)]
+                        match tube {
+                            TaggedControlTube::VmMemory(tube) => {
+                                match tube.recv::<VmMemoryRequest>() {
+                                    Ok(request) => {
+                                        let response = request.execute(
+                                            &mut guest_os.vm,
+                                            &mut sys_allocator_mutex.lock(),
+                                            Arc::clone(&map_request),
+                                            &mut gralloc,
+                                        );
+                                        if let Err(e) = tube.send(&response) {
+                                            error!("failed to send VmMemoryControlResponse: {}", e);
+                                        }
+                                    }
+                                    Err(e) => {
+                                        if let TubeError::Disconnected = e {
+                                            vm_control_indices_to_remove.push(index);
+                                        } else {
+                                            error!("failed to recv VmMemoryControlRequest: {}", e);
+                                        }
+                                    }
+                                }
+                            }
+                            #[cfg(feature = "kiwi")]
+                            TaggedControlTube::GpuServiceComm(tube)
+                            | TaggedControlTube::GpuDeviceServiceComm(tube) => {
+                                match tube.recv::<GpuSendToMain>() {
+                                    Ok(request) => {
+                                        #[cfg(feature = "kiwi")]
+                                        {
+                                            match request {
+                                                SendToService(service_request) => {
+                                                    if let Err(e) = ipc_main_loop_tube.send(
+                                                        &service_vm_state
+                                                            .update_gpu_state_and_generate_message_to_service(&service_request),
+                                                    ) {
+                                                        error!(
+                                                            "Failed to send message to ServiceIpc: {}",
+                                                            e
+                                                        );
+                                                    }
+                                                }
+                                                MuteAc97(mute) => {
+                                                    for ac97_host_tube in &ac97_host_tubes {
+                                                        ac97_host_tube
+                                                            .send(&Ac97Control::Mute(mute))
+                                                            .expect("Could not send mute message!");
+                                                    }
+                                                    service_vm_state.update_audio_state(mute);
+                                                    if let Err(e) = ipc_main_loop_tube.send(
+                                                        &service_vm_state
+                                                            .generate_send_state_message(),
+                                                    ) {
+                                                        error!(
+                                                            "Failed to send message to ServiceIpc: {}",
+                                                            e
+                                                        );
+                                                    }
+                                                }
+                                            }
+                                        }
+                                        #[cfg(not(feature = "kiwi"))]
+                                        {
+                                            info!("Dropping message: {:?}", request);
+                                        }
+                                    }
+                                    Err(e) => {
+                                        error!(
+                                            "Error when receiving message from GpuServiceComm or GpuDeviceServiceComm tube: {}",
+                                            e
+                                        );
+                                    }
+                                }
+                            }
+                            _ => (),
+                            // TODO: handle vm_control messages.
+                            /* TaggedControlTube::Vm(tube) => match tube.recv::<VmRequest>() {
+                                Ok(request) => {
+                                    let mut run_mode_opt = None;
+                                    let response = request.execute(
+                                        &mut run_mode_opt,
+                                        disk_host_tubes,
+                                    );
+                                    if let Err(e) = tube.send(&response) {
+                                        error!("failed to send VmResponse: {}", e);
+                                    }
+                                    if let Some(run_mode) = run_mode_opt {
+                                        info!("control tube changed run mode to {}", run_mode);
+                                        match run_mode {
+                                            VmRunMode::Exiting => {
+                                                break 'poll;
+                                            }
+                                        }
+                                    }
+                                }
+                                Err(e) => {
+                                    if let TubeError::Disconnected = e {
+                                        vm_control_indices_to_remove.push(index);
+                                    } else {
+                                        error!("failed to recv VmRequest: {}", e);
+                                    }
+                                }
+                            }, */
+                        }
+                    }
+                }
+                #[cfg(feature = "proto-tube-hack")]
+                Token::ProtoIpc => {
+                    anti_tamper::forward_security_challenge(
+                        &proto_main_loop_tube,
+                        &anti_tamper_main_thread_tube,
+                    );
+                }
+                // For handling service to crosvm messages. At this point, it is up to the dev how
+                // they want to get the datagram to their component. It's recommended to use
+                // Tubes if it can't be sent directly.
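+                // Each arm below either acts on the message directly or forwards it over an
+                // existing Tube to the component that owns the relevant state.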
+                #[cfg(feature = "kiwi")]
+                Token::ServiceIpc => match ipc_main_loop_tube.recv::<MessageFromService>() {
+                    Ok(request) => match request {
+                        MessageFromService::ShowWindow(_)
+                        | MessageFromService::HideWindow
+                        | MessageFromService::Shutdown
+                        | MessageFromService::MouseInputMode(_) => {
+                            set_package_name(&request);
+                            for control_tube in &control_tubes {
+                                if let TaggedControlTube::GpuServiceComm(tube) = &control_tube {
+                                    if let Err(e) =
+                                        tube.send::<ServiceSendToGpu>(&request.try_into().expect(
+                                            "Could not convert to ServiceSendToGpu request!",
+                                        ))
+                                    {
+                                        error!("Failed to send message to GPU display: {}", e);
+                                    }
+                                    break;
+                                }
+                            }
+                        }
+                        MessageFromService::SetVmMemorySize(balloon_request) => {
+                            info!(
+                                "Service requested balloon adjustment, requested vm size: {}mb",
+                                balloon_request.get_vm_memory_size_mb()
+                            );
+                            if let Some(ref balloon_host_tube) = balloon_host_tube {
+                                if let Err(e) =
+                                    balloon_host_tube.send(&BalloonControlCommand::Adjust {
+                                        num_bytes: get_balloon_size(
+                                            memory_size_mb,
+                                            &balloon_request,
+                                        ),
+                                    })
+                                {
+                                    error!("Failed to modify balloon size - tube closed: {}", e);
+                                }
+                            } else {
+                                error!("Failed to modify balloon size - balloon disabled");
+                            }
+                        }
+                        MessageFromService::Suspend => {
+                            info!("Received suspend request from the service");
+                            // VCPU threads MUST see the VmRunMode flag, otherwise they may
+                            // re-enter the VM.
+                            run_mode_arc.set_and_notify(VmRunMode::Suspending);
+
+                            // Force all vcpus to exit from the hypervisor
+                            for vcpu in vcpu_boxes.lock().iter() {
+                                vcpu.set_immediate_exit(true);
+                            }
+                            guest_os.irq_chip.kick_halted_vcpus();
+
+                            handle_pvclock_request(&pvclock_host_tube, PvClockCommand::Suspend)
+                                .unwrap_or_else(|e| {
+                                    error!("Error handling pvclock suspend: {:?}", e)
+                                });
+                        }
+                        MessageFromService::Resume => {
+                            info!("Received resume request from the service");
+                            handle_pvclock_request(&pvclock_host_tube, PvClockCommand::Resume)
+                                .unwrap_or_else(|e| {
+                                    error!("Error handling pvclock resume: {:?}", e)
+                                });
+
+                            // Make sure any immediate exit bits are disabled
+                            for vcpu in vcpu_boxes.lock().iter() {
+                                vcpu.set_immediate_exit(false);
+                            }
+
+                            run_mode_arc.set_and_notify(VmRunMode::Running);
+                        }
+                        #[cfg(any(not(feature = "anti-tamper"), feature = "proto-tube-hack"))]
+                        MessageFromService::ReceiveSecurityChallenge(_) => {}
+                        #[cfg(all(feature = "anti-tamper", not(feature = "proto-tube-hack")))]
+                        MessageFromService::ReceiveSecurityChallenge(security_challenge) => {
+                            if let Err(_e) = anti_tamper_main_thread_tube.send(&security_challenge)
+                            {
+                                #[cfg(debug_assertions)]
+                                error!(
+                                    "Failed to send challenge program to anti-tamper thread: {}",
+                                    _e
+                                );
+                            }
+                        }
+                        // Receive a mute request when the service receives a lock/unlock screen
+                        // event. The mute request should only be received if the window is NOT
+                        // hidden (the service is responsible for that).
+                        MessageFromService::AudioState(set_audio_state_request) => {
+                            for ac97_host_tube in &ac97_host_tubes {
+                                ac97_host_tube
+                                    .send(&Ac97Control::Mute(set_audio_state_request.get_is_mute()))
+                                    .expect("Could not send mute message!");
+                            }
+                            service_vm_state
+                                .update_audio_state(set_audio_state_request.get_is_mute());
+
+                            if let Err(e) = ipc_main_loop_tube
+                                .send(&service_vm_state.generate_send_state_message())
+                            {
+                                error!("Failed to send message to ServiceIpc: {}", e);
+                            }
+                        }
+                        MessageFromService::GetForegroundingPermission(
+                            foregrounding_permission_request,
+                        ) => {
+                            // Perform best-effort, but do not block on failure
+                            // TODO(b/205917759): Move this to gpu process
+                            let mut result = false;
+                            if let Err(e) = give_foregrounding_permission(
+                                foregrounding_permission_request.get_process_id(),
+                            ) {
+                                error!("Failed to give foregrounding permission: {}", e);
+                            } else {
+                                result = true;
+                            }
+
+                            if let Err(e) = ipc_main_loop_tube.send(
+                                &MessageToService::SendForegroundingPermissionResult(result.into()),
+                            ) {
+                                // Log, but otherwise ignore failures to send as they are
+                                // handleable and non-fatal.
+                                error!(
+                                    "Failed to send foregrounding permission result to the service: {}",
+                                    e
+                                );
+                            }
+                        }
+                        MessageFromService::MergeSessionInvariants(session_invariants_request) => {
+                            let serialized_session_invariants =
+                                session_invariants_request.get_serialized_session_invariants();
+                            merge_session_invariants(serialized_session_invariants);
+                        }
+                        MessageFromService::SetAuthToken(set_auth_token_request) => {
+                            metrics::set_auth_token(set_auth_token_request.get_auth_token());
+                        }
+                        MessageFromService::UploadCrashReport => {
+                            #[cfg(feature = "crash-report")]
+                            crash_report::upload_crash_report("anr");
+
+                            #[cfg(not(feature = "crash-report"))]
+                            info!("Dropping UploadCrashReport message");
+                        }
+                        MessageFromService::SystemHealthRequest => {
+                            // Reply back with an empty report as there are no system health
+                            // metrics to report yet.
+                            if let Err(e) =
+                                ipc_main_loop_tube.send(&MessageToService::SendSystemHealthReport())
+                            {
+                                #[cfg(debug_assertions)]
+                                error!("Failed to send system health report to the service: {}", e);
+                            }
+                        }
+                    },
+                    Err(_e) => {}
+                },
+                #[cfg(all(
+                    feature = "kiwi",
+                    feature = "anti-tamper",
+                    not(feature = "proto-tube-hack")
+                ))]
+                Token::AntiTamper => {
+                    match anti_tamper_main_thread_tube.recv::<MessageToService>() {
+                        Ok(msg) => {
+                            if let Err(_e) = ipc_main_loop_tube.send(&msg) {
+                                #[cfg(debug_assertions)]
+                                error!("Failed to send anti-tamper signal to the service: {}", _e);
+                            }
+                        }
+                        Err(_e) => {
+                            #[cfg(debug_assertions)]
+                            error!(
+                                "Failed to receive challenge signal from anti-tamper thread: {}",
+                                _e
+                            );
+                        }
+                    }
+                }
+                #[cfg(all(feature = "kiwi", feature = "anti-tamper", feature = "proto-tube-hack"))]
+                Token::AntiTamper => anti_tamper::forward_security_signal(
+                    &anti_tamper_main_thread_tube,
+                    &ipc_main_loop_tube,
+                ),
+            }
+        }
+        for event in events.iter().filter(|e| e.is_hungup) {
+            match event.token {
+                Token::VmEvent | Token::BrokerShutdown => {}
+                #[allow(unused_variables)]
+                Token::VmControl { index } => {
+                    // TODO: handle vm control messages as they get ported.
+                    // It's possible more data is readable and buffered while the tube is hungup,
+                    // so don't delete the tube from the poll context until we're sure all the
+                    // data is read.
+                    /*match control_tubes
+                        .get(index)
+                        .map(|s| s.as_ref().get_readable_bytes())
+                    {
+                        Some(Ok(0)) | Some(Err(_)) => vm_control_indices_to_remove.push(index),
+                        Some(Ok(x)) => info!("control index {} has {} bytes readable", index, x),
+                        _ => {}
+                    }*/
+                }
+                #[cfg(feature = "proto-tube-hack")]
+                Token::ProtoIpc => {}
+                #[cfg(feature = "kiwi")]
+                Token::ServiceIpc => {}
+                #[cfg(all(feature = "kiwi", feature = "anti-tamper"))]
+                Token::AntiTamper => {}
+            }
+        }
+
+        // Sort in reverse so the highest indexes are removed first. This removal algorithm
+        // preserves correct indexes as each element is removed.
+        //vm_control_indices_to_remove.sort_unstable_by(|a, b| b.cmp(a));
+        vm_control_indices_to_remove.dedup();
+        for index in vm_control_indices_to_remove {
+            control_tubes.swap_remove(index);
+            /*if let Some(tube) = control_tubes.get(index) {
+                wait_ctx
+                    .modify(
+                        tube, Token::VmControl { index },
+                        EventType::Read
+                    )
+                    .exit_context(Exit::WaitContextAdd, "failed to add trigger to wait context")?;
+            }*/
+        }
+    }
+
+    // VCPU threads MUST see the VmRunMode flag, otherwise they may re-enter the VM.
+    run_mode_arc.set_and_notify(VmRunMode::Exiting);
+
+    // Force all vcpus to exit from the hypervisor
+    for vcpu in vcpu_boxes.lock().iter() {
+        vcpu.set_immediate_exit(true);
+    }
+
+    let mut res = Ok(exit_state);
+    guest_os.irq_chip.kick_halted_vcpus();
+    let _ = exit_evt.write(1);
+    // Ensure any child threads have ended by sending the Exit vm event (possibly again) to ensure
+    // their run loops are aborted.
+    let _ = vm_evt_wrtube.send::<VmEventType>(&VmEventType::Exit);
+    for (i, thread) in vcpu_threads.into_iter().enumerate() {
+        // Wait until all the threads exit, so that the guest_os.vm Arc reference count drops to
+        // 1; otherwise, we would leak memory if we force-killed the thread with terminate.
+        match thread.join() {
+            Ok(Err(e)) => {
+                error!("vcpu thread {} exited with an error: {}", i, e);
+                res = Err(e);
+            }
+            Ok(_) => {}
+            Err(e) => error!("vcpu thread {} panicked: {:?}", i, e),
+        }
+    }
+
+    // This cancels all the outstanding and any future blocking operations.
+    // TODO(b/196911556): Shut down the executor for a cleaner shutdown. Given we are using a
+    // global executor, for a cleaner shutdown we have to call disarm so that all the incoming
+    // requests are run and are cancelled. If we call shutdown, all blocking threads will go away
+    // and incoming operations won't be scheduled to run and will be dropped, leading to a panic.
+    // The ideal place to call shutdown is when we drop a non-global executor.
+    cros_async::unblock_disarm();
+
+    let _ = irq_join_handle.join();
+
+    if let Some(stats) = stats {
+        println!("Statistics Collected:\n{}", stats.lock());
+        println!("Statistics JSON:\n{}", stats.lock().json());
+    }
+
+    // Explicitly drop the VM structure here to allow the devices to clean up before the
+    // control tubes are closed when this function exits.
+    mem::drop(guest_os);
+
+    res
+}
+
+#[cfg(feature = "gvm")]
+const GVM_MINIMUM_VERSION: GvmVersion = GvmVersion {
+    major: 1,
+    minor: 4,
+    patch: 1,
+};
+
+#[cfg(feature = "gvm")]
+fn create_gvm(mem: GuestMemory) -> Result<GvmVm> {
+    info!("Creating GVM");
+    let gvm = Gvm::new()?;
+    match gvm.get_full_version() {
+        Ok(version) => {
+            if version < GVM_MINIMUM_VERSION {
+                error!(
+                    "GVM version {} is below minimum version {}",
+                    version, GVM_MINIMUM_VERSION
+                );
+                return Err(base::Error::new(libc::ENXIO).into());
+            } else {
+                info!("Using GVM version {}.", version)
+            }
+        }
+        Err(e) => {
+            error!("unable to determine gvm version: {}", e);
+            return Err(base::Error::new(libc::ENXIO).into());
+        }
+    }
+    let vm = GvmVm::new(&gvm, mem)?;
+    Ok(vm)
+}
+
+#[cfg(feature = "haxm")]
+fn create_haxm(mem: GuestMemory, kernel_log_file: &Option<String>) -> Result<HaxmVm> {
+    info!("Creating HAXM ghaxm={}", get_use_ghaxm());
+    let haxm = Haxm::new()?;
+    let vm = HaxmVm::new(&haxm, mem)?;
+    if let Some(path) = kernel_log_file {
+        use hypervisor::haxm::HAX_CAP_VM_LOG;
+        if vm.check_raw_capability(HAX_CAP_VM_LOG) {
+            match vm.register_log_file(&path) {
+                Ok(_) => {}
+                Err(e) => match e.errno() {
+                    libc::E2BIG => {
+                        error!(
+                            "kernel_log_file path is too long, kernel log file will not be written"
+                        );
+                    }
+                    _ => return Err(e.into()),
+                },
+            }
+        } else {
+            warn!(
+                "kernel_log_file specified but this version of HAXM does not support kernel log \
+                files"
+            );
+        }
+    }
+    Ok(vm)
+}
+
+#[cfg(feature = "whpx")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn create_whpx(
+    mem: GuestMemory,
+    cpu_count: usize,
+    no_smt: bool,
+    apic_emulation: bool,
+    force_calibrated_tsc_leaf: bool,
+) -> Result<WhpxVm> {
+    info!("Creating Whpx");
+    let whpx = Whpx::new()?;
+
+    // context for non-cpu-specific cpuid results
+    let ctx = CpuIdContext::new(
+        0,
+        cpu_count,
+        no_smt,
+        /*host_cpu_topology=*/ false,
+        None,
+        /* enable_pnp_data */ false,
+        /* itmt */ false,
+        force_calibrated_tsc_leaf,
+        whpx.check_capability(HypervisorCap::CalibratedTscLeafRequired),
+        __cpuid_count,
+        __cpuid,
+    );
+
+    // Get all cpuid entries that we should pre-set
+    let mut cpuid = whpx.get_supported_cpuid()?;
+
+    // Adjust them for crosvm
+    for entry in cpuid.cpu_id_entries.iter_mut() {
+        adjust_cpuid(entry, &ctx);
+    }
+
+    let vm = WhpxVm::new(&whpx, cpu_count, mem, cpuid, apic_emulation)
+        .exit_context(Exit::WhpxSetupError, "failed to create WHPX vm")?;
+
+    Ok(vm)
+}
+
+#[cfg(feature = "gvm")]
+fn create_gvm_irq_chip(vm: &GvmVm, vcpu_count: usize) -> base::Result<GvmIrqChip> {
+    info!("Creating GVM irqchip");
+    let irq_chip = GvmIrqChip::new(vm.try_clone()?, vcpu_count)?;
+    Ok(irq_chip)
+}
+
+#[cfg(feature = "whpx")]
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+fn create_whpx_split_irq_chip(
+    vm: &WhpxVm,
+    ioapic_device_tube: Tube,
+) -> base::Result<WhpxSplitIrqChip> {
+    info!("Creating WHPX split irqchip");
+    WhpxSplitIrqChip::new(
+        vm.try_clone()?,
+        ioapic_device_tube,
+        None, // ioapic_pins
+    )
+}
+
+fn create_userspace_irq_chip<Vm, Vcpu>(
+    vcpu_count: usize,
+    ioapic_device_tube: Tube,
+) -> base::Result<UserspaceIrqChip<Vcpu>>
+where
+    Vm: VmArch + 'static,
+    Vcpu: VcpuArch + 'static,
+{
+    info!("Creating userspace irqchip");
+    let irq_chip =
+        UserspaceIrqChip::new(vcpu_count, ioapic_device_tube, /*ioapic_pins:*/ None)?;
+    Ok(irq_chip)
+}
+
+pub fn get_default_hypervisor() -> Result<HypervisorKind> {
+    // The ordering here matters from most preferable to the least.
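+    // WHPX wins outright when enabled, then HAXM on Intel CPUs, then GVM.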
+    #[cfg(feature = "whpx")]
+    match hypervisor::whpx::Whpx::is_enabled() {
+        true => return Ok(HypervisorKind::Whpx),
+        false => warn!("Whpx not enabled."),
+    };
+    #[cfg(feature = "haxm")]
+    if get_cpu_manufacturer() == CpuManufacturer::Intel {
+        // Make sure Haxm device can be opened before selecting it.
+        match Haxm::new() {
+            Ok(_) => return Ok(HypervisorKind::Ghaxm),
+            Err(e) => warn!("Cannot initialize HAXM: {}", e),
+        };
+    }
+    #[cfg(feature = "gvm")]
+    // Make sure Gvm device can be opened before selecting it.
+    match Gvm::new() {
+        Ok(_) => return Ok(HypervisorKind::Gvm),
+        Err(e) => warn!("Cannot initialize GVM: {}", e),
+    };
+    bail!("no hypervisor enabled!");
+}
+
+fn setup_vm_components(cfg: &Config) -> Result<VmComponents> {
+    let initrd_image = if let Some(initrd_path) = &cfg.initrd_path {
+        Some(
+            File::open(initrd_path).with_exit_context(Exit::OpenInitrd, || {
+                format!("failed to open initrd {}", initrd_path.display())
+            })?,
+        )
+    } else {
+        None
+    };
+
+    let vm_image = match cfg.executable_path {
+        Some(Executable::Kernel(ref kernel_path)) => VmImage::Kernel(
+            File::open(kernel_path).with_exit_context(Exit::OpenKernel, || {
+                format!("failed to open kernel image {}", kernel_path.display())
+            })?,
+        ),
+        Some(Executable::Bios(ref bios_path)) => {
+            VmImage::Bios(File::open(bios_path).with_exit_context(Exit::OpenBios, || {
+                format!("failed to open bios {}", bios_path.display())
+            })?)
+        }
+        _ => panic!("Did not receive a bios or kernel, should be impossible."),
+    };
+
+    let swiotlb = if let Some(size) = cfg.swiotlb {
+        Some(
+            size.checked_mul(1024 * 1024)
+                .ok_or_else(|| anyhow!("requested swiotlb size too large"))?,
+        )
+    } else {
+        match cfg.protected_vm {
+            ProtectionType::Protected | ProtectionType::ProtectedWithoutFirmware => {
+                Some(64 * 1024 * 1024)
+            }
+            ProtectionType::Unprotected | ProtectionType::UnprotectedWithFirmware => None,
+        }
+    };
+
+    Ok(VmComponents {
+        memory_size: cfg
+            .memory
+            .unwrap_or(256)
+            .checked_mul(1024 * 1024)
+            .ok_or_else(|| anyhow!("requested memory size too large"))?,
+        swiotlb,
+        vcpu_count: cfg.vcpu_count.unwrap_or(1),
+        vcpu_affinity: cfg.vcpu_affinity.clone(),
+        cpu_clusters: cfg.cpu_clusters.clone(),
+        cpu_capacity: cfg.cpu_capacity.clone(),
+        no_smt: cfg.no_smt,
+        hugepages: cfg.hugepages,
+        vm_image,
+        android_fstab: cfg
+            .android_fstab
+            .as_ref()
+            .map(|x| {
+                File::open(x).with_exit_context(Exit::OpenAndroidFstab, || {
+                    format!("failed to open android fstab file {}", x.display())
+                })
+            })
+            .map_or(Ok(None), |v| v.map(Some))?,
+        pstore: cfg.pstore.clone(),
+        initrd_image,
+        extra_kernel_params: cfg.params.clone(),
+        acpi_sdts: cfg
+            .acpi_tables
+            .iter()
+            .map(|path| {
+                SDT::from_file(path).with_exit_context(Exit::OpenAcpiTable, || {
+                    format!("failed to open ACPI file {}", path.display())
+                })
+            })
+            .collect::<Result<Vec<SDT>>>()?,
+        rt_cpus: cfg.rt_cpus.clone(),
+        delay_rt: cfg.delay_rt,
+        protected_vm: cfg.protected_vm,
+        dmi_path: cfg.dmi_path.clone(),
+        no_i8042: cfg.no_i8042,
+        no_rtc: cfg.no_rtc,
+        host_cpu_topology: cfg.host_cpu_topology,
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        force_s2idle: cfg.force_s2idle,
+        itmt: false,
+        pvm_fw: None,
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        pci_low_start: cfg.pci_low_start,
+        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+        pcie_ecam: cfg.pcie_ecam,
+    })
+}
+
+// Enum that allows us to assign a variable to what is essentially a &dyn IrqChipArch.
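+// Each variant holds a concrete chip type; as_mut() erases it to &mut dyn IrqChipArch so
+// run_vm can accept a single trait-object parameter.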
+enum WindowsIrqChip<V: VcpuArch> {
+    Userspace(UserspaceIrqChip<V>),
+    #[cfg(feature = "gvm")]
+    Gvm(GvmIrqChip),
+    #[cfg(feature = "whpx")]
+    WhpxSplit(WhpxSplitIrqChip),
+}
+
+impl<V: VcpuArch + 'static> WindowsIrqChip<V> {
+    // Convert our enum to a &mut dyn IrqChipArch
+    fn as_mut(&mut self) -> &mut dyn IrqChipArch {
+        match self {
+            WindowsIrqChip::Userspace(i) => i,
+            #[cfg(feature = "gvm")]
+            WindowsIrqChip::Gvm(i) => i,
+            #[cfg(feature = "whpx")]
+            WindowsIrqChip::WhpxSplit(i) => i,
+        }
+    }
+}
+
+/// Storage for the VM TSC offset for each vcpu. Stored in a static because the tracing thread will
+/// need access to it when tracing is enabled.
+static TSC_OFFSETS: once_cell::sync::Lazy<sync::Mutex<Vec<Option<u64>>>> =
+    once_cell::sync::Lazy::new(|| sync::Mutex::new(Vec::new()));
+
+/// Save the TSC offset for a particular vcpu.
+///
+/// After setting the TSC offset for a vcpu, this function checks the standard deviation of offsets
+/// for all the VCPUs and logs this information. If the TSC offsets differ too much between vcpus
+/// it can cause clock issues in the guest.
+pub fn save_vcpu_tsc_offset(offset: u64, vcpu_id: usize) {
+    let offsets_copy = {
+        let mut offsets = TSC_OFFSETS.lock();
+        // make sure offsets vec is large enough before inserting
+        let newlen = std::cmp::max(offsets.len(), vcpu_id + 1);
+        offsets.resize(newlen, None);
+        offsets[vcpu_id] = Some(offset);
+
+        offsets.clone()
+    };
+
+    // do statistics on a clone of the offsets so we don't hold up other vcpus at this point
+    info!(
+        "TSC offset standard deviation is: {}",
+        standard_deviation(
+            &offsets_copy
+                .iter()
+                .filter(|x| x.is_some())
+                .map(|x| x.unwrap() as u128)
+                .collect::<Vec<u128>>()
+        )
+    );
+}
+
+/// Get the TSC offset of any vcpu. It will pick the first non-None offset it finds in TSC_OFFSETS.
+#[cfg(feature = "cperfetto")]
+pub fn get_vcpu_tsc_offset() -> u64 {
+    for offset in TSC_OFFSETS.lock().iter() {
+        if let Some(offset) = offset {
+            return *offset;
+        }
+    }
+    0
+}
+
+/// Callback that is registered with the tracing crate, and will be called by the tracing thread
+/// when tracing is enabled or disabled. Regardless of whether tracing is being enabled or
+/// disabled for a given category or instance, we just emit a clock snapshot that maps the guest
+/// TSC to the host TSC. Redundant snapshots should not be a problem for perfetto.
+#[cfg(feature = "cperfetto")]
+fn set_tsc_clock_snapshot() {
+    let freq = match devices::tsc_frequency() {
+        Err(e) => {
+            error!(
+                "Could not determine tsc frequency, unable to snapshot tsc offset: {}",
+                e
+            );
+            return;
+        }
+        Ok(freq) => freq,
+    };
+
+    // The offset is the host-to-guest tsc delta
+    let offset = get_vcpu_tsc_offset();
+    // Safe because _rdtsc takes no arguments.
+    let host_tsc = unsafe { std::arch::x86_64::_rdtsc() };
+    perfetto::snapshot_clock(perfetto::ClockSnapshot::new(
+        // Technically our multiplier should be freq/1_000_000_000, but perfetto doesn't
+        // support floating point multipliers yet. So for now we set the freq in Hz and rely
+        // on the merge tool to fix it.
+        perfetto::Clock::new(
+            perfetto::BuiltinClock::Tsc as u32,
+            host_tsc.wrapping_add(offset),
+        )
+        .set_multiplier(freq as u64),
+        perfetto::Clock::new(
+            // The host builtin clock ids are all offset from the guest ids by
+            // HOST_GUEST_CLOCK_ID_OFFSET when the traces are merged. Because this snapshot
+            // contains both a guest and host clock, we need to offset it before merge.
+            perfetto::BuiltinClock::Tsc as u32 + tracing::HOST_GUEST_CLOCK_ID_OFFSET,
+            host_tsc,
+        )
+        .set_multiplier(freq as u64),
+    ));
+}
+
+/// Launches run_config for the broker, reading configuration from a TubeTransporter.
+pub fn run_config_for_broker(raw_tube_transporter: RawDescriptor) -> Result<ExitState> {
+    // Safe because we know that raw_tube_transporter is valid (passed by inheritance), and that
+    // the blocking & framing modes are accurate because we create them ourselves in the broker.
+    let tube_transporter =
+        unsafe { TubeTransporterReader::from_raw_descriptor(raw_tube_transporter) };
+
+    let mut tube_data_list = tube_transporter
+        .read_tubes()
+        .exit_context(Exit::TubeTransporterInit, "failed to init tube transporter")?;
+
+    let bootstrap_tube = tube_data_list
+        .get_tube(TubeToken::Bootstrap)
+        .exit_context(Exit::TubeFailure, "failed to get bootstrap tube")?;
+
+    let mut cfg: Config = bootstrap_tube
+        .recv::<Config>()
+        .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
+
+    let startup_args: CommonChildStartupArgs = bootstrap_tube
+        .recv::<CommonChildStartupArgs>()
+        .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?;
+    let _child_cleanup = common_child_setup(startup_args).exit_context(
+        Exit::CommonChildSetupError,
+        "failed to perform common child setup",
+    )?;
+
+    cfg.broker_shutdown_event = Some(
+        bootstrap_tube
+            .recv::<Event>()
+            .exit_context(Exit::TubeFailure, "failed to read bootstrap tube")?,
+    );
+
+    run_config_inner(cfg)
+}
+
+pub fn run_config(cfg: Config) -> Result<ExitState> {
+    let _raise_timer_resolution = enable_high_res_timers()
+        .exit_context(Exit::EnableHighResTimer, "failed to enable high res timer")?;
+    run_config_inner(cfg)
+}
+
+fn run_config_inner(cfg: Config) -> Result<ExitState> {
+    #[cfg(feature = "kiwi")]
+    {
+        let use_vulkan = if cfg!(feature = "gpu") {
+            match &cfg.gpu_parameters {
+                Some(params) => Some(params.use_vulkan),
+                None => None,
+            }
+        } else {
+            None
+        };
+        anti_tamper::setup_common_metric_invariants(
+            &cfg.product_version,
+            &cfg.product_channel,
+            &use_vulkan,
+        );
+    }
+
+    tracing::init();
+    #[cfg(feature = "cperfetto")]
+    tracing::add_per_trace_callback(set_tsc_clock_snapshot);
+
+    let components: VmComponents = setup_vm_components(&cfg)?;
+
+    let guest_mem_layout = Arch::guest_memory_layout(&components).exit_context(
+        Exit::GuestMemoryLayout,
+        "failed to create guest memory layout",
+    )?;
+    let guest_mem = GuestMemory::new(&guest_mem_layout)
+        .exit_context(Exit::CreateGuestMemory, "failed to create guest memory")?;
+
+    let default_hypervisor = get_default_hypervisor()
+        .exit_context(Exit::NoDefaultHypervisor, "no enabled hypervisor")?;
+    #[allow(unused_mut)]
+    let mut hypervisor = cfg.hypervisor.unwrap_or(default_hypervisor);
+
+    #[cfg(feature = "whpx")]
+    if hypervisor::whpx::Whpx::is_enabled() {
+        // If WHPX is enabled, no other hypervisor can be used, so just override it
+        hypervisor = HypervisorKind::Whpx;
+    }
+
+    match hypervisor {
+        #[cfg(feature = "haxm")]
+        HypervisorKind::Haxm | HypervisorKind::Ghaxm => {
+            if hypervisor == HypervisorKind::Haxm {
+                set_use_ghaxm(false);
+            }
+            let vm = create_haxm(guest_mem, &cfg.kernel_log_file)?;
+            let (ioapic_host_tube, ioapic_device_tube) =
+                Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
+            let irq_chip = create_userspace_irq_chip::<HaxmVm, HaxmVcpu>(
+                components.vcpu_count,
+                ioapic_device_tube,
+            )?;
+            run_vm::<HaxmVcpu, HaxmVm>(
+                cfg,
+                components,
+                vm,
+                WindowsIrqChip::Userspace(irq_chip).as_mut(),
+                Some(ioapic_host_tube),
+            )
+        }
+        #[cfg(feature = "whpx")]
+        HypervisorKind::Whpx => {
apic_emulation_supported = + Whpx::check_whpx_feature(WhpxFeature::LocalApicEmulation) + .exit_context(Exit::WhpxSetupError, "failed to set up whpx")?; + + let no_smt = cfg.no_smt; + + // Default to WhpxSplitIrqChip if it's supported because it's more performant + let irq_chip = cfg.irq_chip.unwrap_or(if apic_emulation_supported { + IrqChipKind::Split + } else { + IrqChipKind::Userspace + }); + + // Both WHPX irq chips use a userspace IOAPIC + let (ioapic_host_tube, ioapic_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + + let vm = create_whpx( + guest_mem, + components.vcpu_count, + no_smt, + apic_emulation_supported && irq_chip == IrqChipKind::Split, + cfg.force_calibrated_tsc_leaf, + )?; + + let mut irq_chip = match irq_chip { + IrqChipKind::Kernel => unimplemented!("Kernel irqchip mode not supported by WHPX"), + IrqChipKind::Split => { + if !apic_emulation_supported { + panic!( + "split irqchip specified but your WHPX version does not support \ + local apic emulation" + ); + } + WindowsIrqChip::WhpxSplit(create_whpx_split_irq_chip(&vm, ioapic_device_tube)?) + } + IrqChipKind::Userspace => { + WindowsIrqChip::Userspace(create_userspace_irq_chip::( + components.vcpu_count, + ioapic_device_tube, + )?) + } + }; + run_vm::( + cfg, + components, + vm, + irq_chip.as_mut(), + Some(ioapic_host_tube), + ) + } + #[cfg(feature = "gvm")] + HypervisorKind::Gvm => { + let vm = create_gvm(guest_mem)?; + let ioapic_host_tube; + let mut irq_chip = match cfg.irq_chip.unwrap_or(IrqChipKind::Kernel) { + IrqChipKind::Split => unimplemented!("Split irqchip mode not supported by GVM"), + IrqChipKind::Kernel => { + ioapic_host_tube = None; + WindowsIrqChip::Gvm(create_gvm_irq_chip(&vm, components.vcpu_count)?) + } + IrqChipKind::Userspace => { + let (host_tube, ioapic_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + ioapic_host_tube = Some(host_tube); + WindowsIrqChip::Userspace(create_userspace_irq_chip::( + components.vcpu_count, + ioapic_device_tube, + )?) + } + }; + run_vm::(cfg, components, vm, irq_chip.as_mut(), ioapic_host_tube) + } + } +} + +fn run_vm( + #[allow(unused_mut)] mut cfg: Config, + #[allow(unused_mut)] mut components: VmComponents, + mut vm: V, + irq_chip: &mut dyn IrqChipArch, + ioapic_host_tube: Option, +) -> Result +where + Vcpu: VcpuArch + 'static, + V: VmArch + 'static, +{ + let vm_memory_size_mb = components.memory_size / (1024 * 1024); + let mut control_tubes = Vec::new(); + let mut irq_control_tubes = Vec::new(); + // Create one control tube per disk. + let mut disk_device_tubes = Vec::new(); + let mut disk_host_tubes = Vec::new(); + let disk_count = cfg.disks.len(); + for _ in 0..disk_count { + let (disk_host_tube, disk_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + disk_host_tubes.push(disk_host_tube); + disk_device_tubes.push(disk_device_tube); + } + let (gpu_host_tube, gpu_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(gpu_host_tube)); + + if let Some(ioapic_host_tube) = ioapic_host_tube { + irq_control_tubes.push(ioapic_host_tube); + } + + // Balloon gets a special socket so balloon requests can be forwarded from the main process. 
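+    // A Tube pair is two connected endpoints of one channel: a message sent on one
+    // end is received on the other. A minimal sketch of the pattern used throughout
+    // this function (names hypothetical):
+    //
+    //     let (host_end, device_end) = Tube::pair()?;
+    //     host_end.send(&request)?;
+    //     let request: Request = device_end.recv()?;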
+ let (balloon_host_tube, balloon_device_tube) = if cfg.balloon { + let (balloon_host_tube, balloon_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + (Some(balloon_host_tube), Some(balloon_device_tube)) + } else { + (None, None) + }; + // The balloon device also needs a tube to communicate back to the main process to + // handle remapping memory dynamically. + let dynamic_mapping_device_tube = if cfg.balloon { + let (dynamic_mapping_host_tube, dynamic_mapping_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::VmMemory(dynamic_mapping_host_tube)); + Some(dynamic_mapping_device_tube) + } else { + None + }; + + // PvClock gets a tube for handling suspend/resume requests from the main thread. + let (pvclock_host_tube, pvclock_device_tube) = if cfg.pvclock { + let (host, device) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + (Some(host), Some(device)) + } else { + (None, None) + }; + + #[cfg(feature = "kiwi")] + { + if cfg.service_pipe_name.is_some() { + let (gpu_main_host_tube, gpu_main_display_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::GpuServiceComm(gpu_main_host_tube)); + let mut gpu_parameters = cfg + .gpu_parameters + .as_mut() + .expect("missing GpuParameters in config"); + gpu_parameters.display_params.gpu_main_display_tube = + Some(Arc::new(Mutex::new(gpu_main_display_tube))); + } + }; + + // Create a ServiceComm tube to pass to the gpu device + #[cfg(feature = "kiwi")] + let gpu_device_service_tube = { + let (gpu_device_service_tube, gpu_device_service_host_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + control_tubes.push(TaggedControlTube::GpuDeviceServiceComm( + gpu_device_service_host_tube, + )); + gpu_device_service_tube + }; + + let gralloc = + RutabagaGralloc::new().exit_context(Exit::CreateGralloc, "failed to create gralloc")?; + let map_request: Arc>> = Arc::new(Mutex::new(None)); + + let (vm_evt_wrtube, vm_evt_rdtube) = + Tube::directional_pair().context("failed to create vm event tube")?; + let pstore_size = components.pstore.as_ref().map(|pstore| pstore.size as u64); + let mut sys_allocator = SystemAllocator::new( + Arch::get_system_allocator_config(&vm), + pstore_size, + &cfg.mmio_address_ranges, + ) + .context("failed to create system allocator")?; + + let mut ac97_host_tubes = Vec::new(); + let mut ac97_device_tubes = Vec::new(); + for _ in &cfg.ac97_parameters { + let (ac97_host_tube, ac97_device_tube) = + Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?; + ac97_host_tubes.push(ac97_host_tube); + ac97_device_tubes.push(ac97_device_tube); + } + + // Allocate the ramoops region first. 
+ let ramoops_region = match &components.pstore { + Some(pstore) => Some( + arch::pstore::create_memory_region( + &mut vm, + sys_allocator.reserved_region().unwrap(), + &pstore, + ) + .exit_context(Exit::Pstore, "failed to allocate pstore region")?, + ), + None => None, + }; + + let init_balloon_size = components + .memory_size + .checked_sub(cfg.init_memory.map_or(components.memory_size, |m| { + m.checked_mul(1024 * 1024).unwrap_or(u64::MAX) + })) + .context("failed to calculate init balloon size")?; + + let tsc_state = devices::tsc_state().exit_code(Exit::TscCalibrationFailed)?; + let tsc_sync_mitigations = get_tsc_sync_mitigations(&tsc_state, components.vcpu_count); + + if tsc_state.core_grouping.size() > 1 { + // Host TSCs are not in sync, log a metric about it. + warn!( + "Host TSCs are not in sync, applying the following mitigations: {:?}", + tsc_sync_mitigations + ); + log_descriptor( + MetricEventType::TscCoresOutOfSync, + // casting u64 as i64 is a no-op, so we don't lose any part of the bitmask + tsc_state.core_grouping.core_grouping_bitmask() as i64, + ); + } + + let pci_devices = create_devices( + &mut cfg, + vm.get_memory(), + &vm_evt_wrtube, + &mut irq_control_tubes, + gpu_device_tube, + &mut disk_device_tubes, + balloon_device_tube, + pvclock_device_tube, + dynamic_mapping_device_tube, + /* inflate_tube= */ None, + init_balloon_size, + Arc::clone(&map_request), + ac97_host_tubes, + #[cfg(feature = "kiwi")] + gpu_device_service_tube, + tsc_state.frequency, + )?; + + let mut vcpu_ids = Vec::new(); + + let windows = Arch::build_vm::( + components, + &vm_evt_wrtube, + &mut sys_allocator, + &cfg.serial_parameters, + None, + (&cfg.battery_type, None), + vm, + ramoops_region, + pci_devices, + irq_chip, + &mut vcpu_ids, + /*debugcon_jail=*/ None, + ) + .exit_context(Exit::BuildVm, "the architecture failed to build the vm")?; + + let _render_node_host = (); + + let stats = if cfg.exit_stats { + Some(Arc::new(Mutex::new(StatisticsCollector::new()))) + } else { + None + }; + + run_control( + windows, + sys_allocator, + control_tubes, + irq_control_tubes, + vm_evt_rdtube, + vm_evt_wrtube, + cfg.broker_shutdown_event.take(), + balloon_host_tube, + pvclock_host_tube, + Arc::clone(&map_request), + gralloc, + stats, + #[cfg(feature = "kiwi")] + cfg.service_pipe_name, + ac97_device_tubes, + vm_memory_size_mb, + cfg.host_cpu_topology, + tsc_sync_mitigations, + cfg.force_calibrated_tsc_leaf, + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + fn create_config(test_dir: &TempDir) -> Config { + let mut config = Config::default(); + + let dummy_kernel_path = test_dir.path().join("dummy_kernel.txt"); + OpenOptions::new() + .create(true) + .write(true) + .open(&dummy_kernel_path) + .expect("Could not open file!"); + config.executable_path = Some(Executable::Kernel(dummy_kernel_path)); + + config + } + + #[test] + #[should_panic(expected = "Did not receive a bios or kernel")] + fn setup_vm_components_panics_when_no_kernel_provided() { + let mut config = + create_config(&TempDir::new().expect("Could not create temporary directory!")); + config.executable_path = None; + let _ = setup_vm_components(&config); + } + + #[test] + fn setup_vm_components_stores_memory_in_bytes() { + let tempdir = TempDir::new().expect("Could not create temporary directory!"); + let mut config = create_config(&tempdir); + config.memory = Some(1); + let vm_components = setup_vm_components(&config).expect("failed to setup vm components"); + assert_eq!(vm_components.memory_size, 1024 * 1024); + } + + 
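+    // Worked example for the overflow test below: config.memory is in MiB, and
+    // u64::MAX bytes is 17_592_186_044_415 MiB (u64::MAX / 1024 / 1024), so one MiB
+    // more than that cannot be expressed in bytes as a u64 and must be rejected
+    // rather than silently wrapped.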
#[test] + fn setup_vm_components_fails_when_memory_too_large() { + let tempdir = TempDir::new().expect("Could not create temporary directory!"); + let mut config = create_config(&tempdir); + // One mb more than a u64 can hold in bytes + config.memory = Some((u64::MAX / 1024 / 1024) + 1); + setup_vm_components(&config).err().expect("expected error"); + } +} diff --git a/src/sys/windows/irq_wait.rs b/src/sys/windows/irq_wait.rs new file mode 100644 index 0000000000..8fe773c300 --- /dev/null +++ b/src/sys/windows/irq_wait.rs @@ -0,0 +1,364 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +//! Handles the main wait loop for IRQs. +//! Should be started on a background thread. + +use base::{ + error, info, warn, Event, EventToken, ReadNotifier, Result, Tube, TubeError, WaitContext, + MAXIMUM_WAIT_OBJECTS, +}; +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +use devices::IrqChipAArch64 as IrqChipArch; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use devices::IrqChipX86_64 as IrqChipArch; +use devices::{IrqEdgeEvent, IrqEventIndex, IrqEventSource}; +use metrics::{log_high_frequency_descriptor_event, MetricEventType}; +use resources::SystemAllocator; +use std::collections::HashMap; +use std::sync::Arc; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant}; +use sync::Mutex; +use vm_control::{IrqSetup, VmIrqRequest}; + +pub struct IrqWaitWorker { + exit_evt: Event, + irq_chip: Box, + irq_control_tubes: Vec, + sys_allocator: Arc>, +} + +impl IrqWaitWorker { + pub fn start( + exit_evt: Event, + irq_chip: Box, + irq_control_tubes: Vec, + sys_allocator: Arc>, + ) -> JoinHandle> { + let mut irq_worker = IrqWaitWorker { + exit_evt, + irq_chip, + irq_control_tubes, + sys_allocator, + }; + thread::Builder::new() + .name("irq_wait_loop".into()) + .spawn(move || irq_worker.run()) + .unwrap() + } + + fn run(&mut self) -> Result<()> { + #[derive(EventToken)] + enum Token { + Exit, + VmControl { index: usize }, + DelayedIrqEvent, + } + + let wait_ctx = WaitContext::build_with(&[(&self.exit_evt, Token::Exit)])?; + + let mut max_event_index: usize = 0; + let mut vm_control_added_irq_events: Vec = Vec::new(); + let mut irq_event_sources: HashMap = HashMap::new(); + // TODO(b/190828888): Move irq logging into the irqchip impls. + let irq_frequencies = Arc::new(Mutex::new(vec![0; max_event_index + 1])); + let irq_events = self.irq_chip.irq_event_tokens()?; + let mut children = vec![]; + + let (mut child_wait_ctx, child_join_handle) = IrqWaitWorkerChild::start( + self.exit_evt.try_clone()?, + self.irq_chip.try_box_clone()?, + irq_frequencies.clone(), + )?; + children.push(child_join_handle); + + for (event_index, source, evt) in irq_events { + child_wait_ctx.add(&evt, ChildToken::IrqEvent { event_index })?; + max_event_index = std::cmp::max(max_event_index, event_index); + irq_event_sources.insert(event_index, source); + + vm_control_added_irq_events.push(evt); + } + + irq_frequencies.lock().resize(max_event_index + 1, 0); + + for (index, control_tube) in self.irq_control_tubes.iter().enumerate() { + wait_ctx.add(control_tube.get_read_notifier(), Token::VmControl { index })?; + } + + let mut _delayed_event_token: Option = None; + if let Some(delayed_token) = self.irq_chip.irq_delayed_event_token()? { + wait_ctx.add(&delayed_token, Token::DelayedIrqEvent)?; + // store the token, so that it lasts outside this scope. 
+ // We must store the event as try_clone creates a new event. It won't keep + // the current event valid that is waited on inside wait_ctx. + _delayed_event_token = Some(delayed_token); + } + + let mut intr_stat_sample_time = Instant::now(); + + 'poll: loop { + let events = { + match wait_ctx.wait() { + Ok(v) => v, + Err(e) => { + error!("failed to wait on irq thread: {}", e); + break 'poll; + } + } + }; + + let mut vm_control_indices_to_remove = Vec::new(); + for event in events.iter().filter(|e| e.is_readable) { + match event.token { + Token::Exit => { + info!("irq event loop got exit event"); + break 'poll; + } + Token::VmControl { index } => { + if let Some(tube) = self.irq_control_tubes.get(index) { + match tube.recv::() { + Ok(request) => { + let response = { + let irq_chip = &mut self.irq_chip; + let exit_evt = &self.exit_evt; + // TODO(b/229262201): Refactor the closure into a standalone function to reduce indentation. + request.execute( + |setup| match setup { + IrqSetup::Event( + irq, + ev, + device_id, + queue_id, + device_name, + ) => { + let irqevent = IrqEdgeEvent::from_event( + ev.try_clone() + .expect("Failed to clone irq event."), + ); + let source = IrqEventSource { + device_id: device_id.try_into()?, + queue_id, + device_name, + }; + let event_index = irq_chip + .register_edge_irq_event( + irq, + &irqevent, + source.clone(), + )?; + if let Some(event_index) = event_index { + max_event_index = std::cmp::max( + event_index, + irq as usize, + ); + irq_frequencies + .lock() + .resize(max_event_index + 1, 0); + irq_event_sources + .insert(event_index, source); + // Make new thread if needed, including buffer space for any + // events we didn't explicitly add (exit/reset/etc) + if irq_event_sources.len() + % (MAXIMUM_WAIT_OBJECTS - 3) + == 0 + { + // The child wait thread has reached max capacity, we + // need to add another. 
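+                                                            // Concretely: assuming the usual Win32
+                                                            // limit of 64 wait objects, each child
+                                                            // waits on up to 64 - 3 = 61 irq events
+                                                            // plus its own exit event, so every
+                                                            // 61st registration spawns a new child.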
+                                                            let (new_wait_ctx, child_join_handle) =
+                                                                IrqWaitWorkerChild::start(
+                                                                    exit_evt.try_clone()?,
+                                                                    irq_chip.try_box_clone()?,
+                                                                    irq_frequencies.clone(),
+                                                                )?;
+                                                            child_wait_ctx = new_wait_ctx;
+                                                            children.push(child_join_handle);
+                                                        }
+                                                        let irqevent = irqevent.get_trigger().try_clone()?;
+                                                        match child_wait_ctx.add(
+                                                            &irqevent,
+                                                            ChildToken::IrqEvent { event_index },
+                                                        ) {
+                                                            Err(e) => {
+                                                                warn!("failed to add IrqEvent to synchronization context: {}", e);
+                                                                Err(e)
+                                                            },
+                                                            Ok(_) => {
+                                                                vm_control_added_irq_events.push(irqevent);
+                                                                Ok(())
+                                                            }
+                                                        }
+                                                    } else {
+                                                        Ok(())
+                                                    }
+                                                }
+                                                IrqSetup::Route(route) => irq_chip.route_irq(route),
+                                                IrqSetup::UnRegister(irq, ev) => irq_chip
+                                                    .unregister_edge_irq_event(
+                                                        irq,
+                                                        &IrqEdgeEvent::from_event(ev.try_clone()?),
+                                                    ),
+                                            },
+                                            &mut self.sys_allocator.lock(),
+                                        )
+                                    };
+                                    if let Err(e) = tube.send(&response) {
+                                        error!("failed to send VmIrqResponse: {}", e);
+                                    }
+                                }
+                                Err(e) => {
+                                    if let TubeError::Disconnected = e {
+                                        vm_control_indices_to_remove.push(index);
+                                    } else {
+                                        error!("failed to recv VmIrqRequest: {}", e);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    Token::DelayedIrqEvent => {
+                        if let Err(e) = self.irq_chip.process_delayed_irq_events() {
+                            warn!("can't deliver delayed irqs: {}", e);
+                        }
+                    }
+                }
+            }
+
+            let now = Instant::now();
+            let intr_stat_duration = now.duration_since(intr_stat_sample_time);
+
+            // Include interrupt stats every 10 seconds.
+            if intr_stat_duration > Duration::from_secs(10) {
+                let mut event_indices: Vec<(&usize, &IrqEventSource)> =
+                    irq_event_sources.iter().collect();
+                // Sort the devices by irq_frequency.
+                let mut locked_irq_frequencies = irq_frequencies.lock();
+                event_indices
+                    .sort_by_key(|(idx, _)| std::cmp::Reverse(locked_irq_frequencies[**idx]));
+                let rates: Vec<String> = event_indices
+                    .iter()
+                    .filter(|(idx, _)| locked_irq_frequencies[**idx] > 0)
+                    .map(|(idx, source)| {
+                        let rate = locked_irq_frequencies[**idx] / intr_stat_duration.as_secs();
+                        // As the descriptor, use a 64-bit int containing two 32-bit ids:
+                        // low bits: queue_id, high bits: device_id.
+                        let descriptor_bytes: [u8; 8] = {
+                            let mut bytes: [u8; 8] = [0; 8];
+                            for (i, byte) in
+                                (source.queue_id as u32).to_le_bytes().iter().enumerate()
+                            {
+                                bytes[i] = *byte
+                            }
+                            let device_id: u32 = source.device_id.into();
+                            for (i, byte) in device_id.to_le_bytes().iter().enumerate() {
+                                bytes[i + 4] = *byte
+                            }
+                            bytes
+                        };
+                        log_high_frequency_descriptor_event(
+                            MetricEventType::Interrupts,
+                            i64::from_le_bytes(descriptor_bytes),
+                            rate as i64,
+                        );
+                        format!("{}({})->{}/s", source.device_name, source.queue_id, rate)
+                    })
+                    .collect();
+
+                info!("crosvm-interrupt-rates: {}", rates.join(", "));
+
+                // Reset sample time and counters.
+                intr_stat_sample_time = now;
+                *locked_irq_frequencies = vec![0; max_event_index + 1];
+            }
+
+            vm_control_indices_to_remove.dedup();
+            for index in vm_control_indices_to_remove {
+                self.irq_control_tubes.swap_remove(index);
+            }
+        }
+
+        // Ensure all children have ended by firing off the exit event again to make sure the loop
+        // is exited, and joining to ensure none are hanging.
+        let _ = self.exit_evt.write(1);
+        for child in children {
+            match child.join() {
+                Ok(Err(e)) => warn!("IRQ worker child ended in error: {}", e),
+                Err(e) => warn!("IRQ worker child panicked with error: {:?}", e),
+                _ => {}
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(EventToken)]
+enum ChildToken {
+    Exit,
+    IrqEvent { event_index: IrqEventIndex },
+}
+/// An arbitrarily expandable worker for waiting on irq events.
+/// This worker is responsible for handling the irq events, whereas
+/// the parent worker's job is just to handle the irq control tube requests.
+struct IrqWaitWorkerChild {
+    wait_ctx: Arc<WaitContext<ChildToken>>,
+    exit_evt: Event,
+    irq_chip: Box<dyn IrqChipArch>,
+    irq_frequencies: Arc<Mutex<Vec<u64>>>,
+}
+
+impl IrqWaitWorkerChild {
+    fn start(
+        exit_evt: Event,
+        irq_chip: Box<dyn IrqChipArch>,
+        irq_frequencies: Arc<Mutex<Vec<u64>>>,
+    ) -> Result<(Arc<WaitContext<ChildToken>>, JoinHandle<Result<()>>)> {
+        let child_wait_ctx = Arc::new(WaitContext::new()?);
+        let mut child = IrqWaitWorkerChild {
+            wait_ctx: child_wait_ctx.clone(),
+            exit_evt,
+            irq_chip,
+            irq_frequencies,
+        };
+        let join_handle = thread::Builder::new()
+            .name("irq_child_wait_loop".into())
+            .spawn(move || child.run())?;
+
+        Ok((child_wait_ctx, join_handle))
+    }
+
+    fn run(&mut self) -> Result<()> {
+        self.wait_ctx.add(&self.exit_evt, ChildToken::Exit)?;
+        'poll: loop {
+            let events = {
+                match self.wait_ctx.wait() {
+                    Ok(v) => v,
+                    Err(e) => {
+                        error!("failed to wait on irq child thread: {}", e);
+                        break 'poll;
+                    }
+                }
+            };
+
+            for event in events.iter().filter(|e| e.is_readable) {
+                match event.token {
+                    ChildToken::Exit => {
+                        info!("irq child event loop got exit event");
+                        break 'poll;
+                    }
+                    ChildToken::IrqEvent { event_index } => {
+                        self.irq_frequencies.lock()[event_index] += 1;
+                        if let Err(e) = self.irq_chip.service_irq_event(event_index) {
+                            error!("failed to signal irq {}: {}", event_index, e);
+                        }
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/src/sys/windows/main.rs b/src/sys/windows/main.rs
new file mode 100644
index 0000000000..9fba29c304
--- /dev/null
+++ b/src/sys/windows/main.rs
@@ -0,0 +1,247 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use anyhow::{anyhow, Result};
+use argh::FromArgs;
+use base::{
+    info,
+    syslog::{self, LogConfig},
+    FromRawDescriptor, RawDescriptor,
+};
+use broker_ipc::{common_child_setup, CommonChildStartupArgs};
+use metrics::{
+    self,
+    event_details_proto::{EmulatorDllDetails, RecordDetails},
+    MetricEventType,
+};
+#[cfg(all(feature = "slirp"))]
+use net_util::slirp::sys::windows::SlirpStartupConfig;
+use tube_transporter::{TubeToken, TubeTransporterReader};
+use win_util::{DllNotificationData, DllWatcher};
+
+use std::collections::HashSet;
+use std::ffi::OsString;
+use std::fs::OpenOptions;
+
+use crate::{
+    crosvm::{
+        argument::{self, Argument},
+        cmdline::RunCommand,
+        sys::cmdline::{Commands, DevicesSubcommand},
+        sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow},
+    },
+    metrics::run_metrics,
+    CommandStatus, Config,
+};
+
+#[cfg(all(feature = "slirp"))]
+pub(crate) fn run_slirp(args: Vec<String>) -> Result<()> {
+    let arguments = &[Argument::value(
+        "bootstrap",
+        "TRANSPORT_TUBE_RD",
+        "TubeTransporter descriptor used to bootstrap the Slirp process.",
+    )];
+
+    let raw_transport_tube = set_bootstrap_arguments(args, arguments)
+        .exit_context(Exit::InvalidSubCommandArgs, "error in setting slirp args")?;
+
+    // Safe because we know that raw_transport_tube is valid (passed by inheritance),
+    // and that the blocking & framing modes are accurate because we create them ourselves
+    // in the broker.
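+    // (Sketch of the ownership contract relied on here: from_raw_descriptor takes
+    // ownership of the handle, so the broker must not close or reuse it afterwards;
+    // wrapping the same raw descriptor twice would double-close it on drop.)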
+ let tube_transporter = + unsafe { TubeTransporterReader::from_raw_descriptor(raw_transport_tube.unwrap()) }; + + let mut tube_data_list = tube_transporter + .read_tubes() + .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?; + + let bootstrap_tube = tube_data_list.get_tube(TubeToken::Bootstrap).unwrap(); + + let startup_args: CommonChildStartupArgs = + bootstrap_tube.recv::().unwrap(); + let _child_cleanup = common_child_setup(startup_args).exit_context( + Exit::CommonChildSetupError, + "failed to perform common child setup", + )?; + + let slirp_config = bootstrap_tube.recv::().unwrap(); + + if let Some(mut target) = sandbox::TargetServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")? + { + target.lower_token(); + } + + net_util::Slirp::run_slirp_process( + slirp_config.slirp_pipe, + slirp_config.shutdown_event, + #[cfg(feature = "slirp-ring-capture")] + slirp_config.slirp_capture_file, + ); + Ok(()) +} + +pub fn run_broker_impl(cfg: Config) -> Result<()> { + tracing::init(); + Ok(crate::crosvm::sys::windows::broker::run(cfg)?) +} + +pub fn initialize_sandbox() -> Result<()> { + if sandbox::is_sandbox_target() { + // Get the TargetServices pointer so that it gets initialized. + let _ = sandbox::TargetServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")?; + } + Ok(()) +} + +#[cfg(feature = "kiwi")] +pub fn sandbox_lower_token() -> Result<()> { + if let Some(mut target) = sandbox::TargetServices::get() + .exit_context(Exit::SandboxError, "sandbox operation failed")? + { + target.lower_token(); + } + Ok(()) +} + +fn report_dll_loaded(dll_name: String) { + let mut dll_load_details = EmulatorDllDetails::new(); + dll_load_details.set_dll_base_name(dll_name); + let mut details = RecordDetails::new(); + details.set_emulator_dll_details(dll_load_details); + metrics::log_event_with_details(MetricEventType::DllLoaded, &details); +} + +pub fn get_library_watcher( +) -> std::io::Result> { + let mut dlls: HashSet = HashSet::new(); + DllWatcher::new( + move |data| { + info!("DLL loaded: {:?}", data.base_dll_name); + if !dlls.insert(data.base_dll_name.clone()) && metrics::is_initialized() { + report_dll_loaded(data.base_dll_name.to_string_lossy().into_owned()); + } + }, + |data| info!("DLL unloaded: {:?}", data.base_dll_name), + ) +} + +pub(crate) fn start_device(command: DevicesSubcommand) -> Result<()> { + Err(anyhow!("unknown device name: {:?}", command)) +} + +pub(crate) fn run_vm_for_broker(args: Vec) -> Result<()> { + // This is a noop on unix. + initialize_sandbox()?; + let arguments = &[Argument::value( + "bootstrap", + "TRANSPORT_TUBE_RD", + "TubeTransporter descriptor used to bootstrap the main process.", + )]; + + let raw_transport_tube = set_bootstrap_arguments(args, arguments).exit_context( + Exit::InvalidSubCommandArgs, + "error in setting crosvm broker args", + )?; + let exit_state = crate::sys::windows::run_config_for_broker(raw_transport_tube.unwrap()); + crate::to_command_status(exit_state).map(|_| ()) +} + +pub(crate) fn set_bootstrap_arguments( + args: Vec, + arguments: &[Argument], +) -> std::result::Result, argument::Error> { + let mut raw_transport_tube = None; + crate::crosvm::argument::set_arguments(args.iter(), &arguments[..], |name, value| { + if name == "bootstrap" { + raw_transport_tube = Some(value.unwrap().parse::().or(Err( + argument::Error::InvalidValue { + value: value.unwrap().to_string(), + expected: String::from("a raw descriptor integer"), + }, + ))? 
as RawDescriptor); + } + Ok(()) + }) + .expect("Failed to set bootstrap arguments"); + Ok(raw_transport_tube) +} + +pub(crate) fn cleanup() { + // We've already cleaned everything up by waiting for all the vcpu threads on windows. + // TODO: b/142733266. When we sandbox each device, have a way to terminate the other sandboxed processes. +} + +fn run_broker(cmd: RunCommand) -> Result<()> { + match TryInto::::try_into(cmd) { + Ok(cfg) => run_broker_impl(cfg), + Err(e) => Err(anyhow!("{}", e)), + } +} + +pub(crate) fn run_command(cmd: Commands) -> anyhow::Result<()> { + match cmd { + Commands::RunMetrics(cmd) => run_metrics(cmd.args), + Commands::RunMP(cmd) => { + let mut x: Vec<&str> = vec![]; + for s in cmd.args.iter() { + if s == "backend=win_audio" { + x.push(&s.as_str()); + continue; + } + match s.split_once('=') { + Some((k, v)) => { + x.push(&k); + x.push(&v); + } + None => x.push(s.as_str()), + } + } + let cmd = RunCommand::from_args(&["run-mp"], &x); + match cmd { + Ok(cmd) => run_broker(cmd), + Err(e) => Err(anyhow!("Failed to create config: {:?}", e)), + } + } + Commands::RunMain(cmd) => run_vm_for_broker(cmd.args), + #[cfg(feature = "slirp")] + Commands::RunSlirp(cmd) => run_slirp(cmd.args), + } +} + +pub(crate) fn init_log(log_config: LogConfig, cfg: &Config) -> Result<()> +where + F: Fn(&mut base::syslog::fmt::Formatter, &log::Record<'_>) -> std::io::Result<()> + Sync + Send, +{ + if let Err(e) = syslog::init_with(LogConfig { + proc_name: if let Some(ref tag) = cfg.syslog_tag { + tag.to_string() + } else { + String::from("crosvm") + }, + pipe: if let Some(log_file_path) = &cfg.log_file { + let file = OpenOptions::new() + .create(true) + .append(true) + .open(log_file_path) + .with_exit_context(Exit::LogFile, || { + format!("failed to open log file {}", log_file_path) + })?; + Some(Box::new(file)) + } else { + None + }, + stderr: if cfg.log_file.is_some() { false } else { true }, + ..log_config + }) { + eprintln!("failed to initialize syslog: {}", e); + return Err(anyhow!("failed to initialize syslog: {}", e)); + } + Ok(()) +} + +pub(crate) fn error_to_exit_code(res: &std::result::Result) -> i32 { + res.to_exit_code().unwrap_or(Exit::UnknownError.into()) +} diff --git a/src/sys/windows/metrics.rs b/src/sys/windows/metrics.rs new file mode 100644 index 0000000000..3a29a3bbd6 --- /dev/null +++ b/src/sys/windows/metrics.rs @@ -0,0 +1,93 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +cfg_if::cfg_if! 
{
+    if #[cfg(feature = "kiwi")] {
+        extern crate metrics as metrics_crate;
+        use anyhow::Context;
+        use broker_ipc::{common_child_setup, CommonChildStartupArgs};
+        use base::Tube;
+        use std::thread;
+        use metrics_crate::MetricsController;
+        use crate::crosvm::sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow};
+        use crate::sys::windows::main::set_bootstrap_arguments;
+        use tube_transporter::{TubeToken, TubeTransporterReader};
+        use base::FromRawDescriptor;
+    }
+}
+
+#[cfg(feature = "kiwi")]
+use crate::crosvm::argument::Argument;
+use anyhow::Result;
+pub(crate) use metrics::{
+    get_destructor, log_descriptor, merge_session_invariants, set_auth_token, set_package_name,
+    MetricEventType,
+};
+
+pub(crate) fn run_metrics(#[allow(unused_variables)] args: Vec<String>) -> Result<()> {
+    #[cfg(not(feature = "kiwi"))]
+    return Ok(());
+
+    #[cfg(feature = "kiwi")]
+    {
+        let arguments = &[Argument::value(
+            "bootstrap",
+            "TRANSPORT_TUBE_RD",
+            "TubeTransporter descriptor used to bootstrap the metrics process.",
+        )];
+
+        let raw_transport_tube = set_bootstrap_arguments(args, arguments).exit_context(
+            Exit::InvalidSubCommandArgs,
+            "error in setting crosvm metrics controller args",
+        )?;
+
+        // Safe because we know that raw_transport_tube is valid (passed by inheritance), and that
+        // the blocking & framing modes are accurate because we create them ourselves in the broker.
+        let tube_transporter =
+            unsafe { TubeTransporterReader::from_raw_descriptor(raw_transport_tube.unwrap()) };
+
+        let mut tube_data_list = tube_transporter
+            .read_tubes()
+            .exit_context(Exit::TubeTransporterInit, "failed to initialize tube")?;
+
+        let bootstrap_tube = tube_data_list.get_tube(TubeToken::Bootstrap).unwrap();
+
+        let startup_args: CommonChildStartupArgs =
+            bootstrap_tube.recv::<CommonChildStartupArgs>().unwrap();
+        let _child_cleanup = common_child_setup(startup_args).exit_context(
+            Exit::CommonChildSetupError,
+            "failed to perform common child setup",
+        )?;
+
+        let metrics_tubes = bootstrap_tube.recv::<Vec<Tube>>().unwrap();
+
+        tracing::init();
+        crate::sys::sandbox_lower_token()?;
+
+        let mut metrics_controller = MetricsController::new(metrics_tubes);
+        metrics_controller
+            .run()
+            .exit_context(Exit::MetricsController, "metrics controller failed")
+    }
+}
+
+pub(crate) fn setup_metrics_reporting() -> Result<()> {
+    #[cfg(not(feature = "kiwi"))]
+    return Ok(());
+
+    #[cfg(feature = "kiwi")]
+    {
+        let (metrics_controller_tube, metrics_agent_tube) =
+            Tube::pair().exit_context(Exit::CreateTube, "failed to create tube")?;
+        thread::spawn(move || {
+            let mut metrics_controller = MetricsController::new(vec![metrics_controller_tube]);
+            metrics_controller
+                .run()
+                .context("metrics controller failed")
+                .unwrap();
+        });
+        metrics::initialize(metrics_agent_tube);
+        Ok(())
+    }
+}
diff --git a/src/sys/windows/panic_hook.rs b/src/sys/windows/panic_hook.rs
new file mode 100644
index 0000000000..1d4ff5d91f
--- /dev/null
+++ b/src/sys/windows/panic_hook.rs
@@ -0,0 +1,26 @@
+// Copyright 2022 The ChromiumOS Authors.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use std::panic;
+use std::process::abort;
+
+use crate::metrics;
+
+/// The intent of our panic hook is to get panic info and a stacktrace into the syslog, even for
+/// jailed subprocesses. It will always abort on panic to ensure a minidump is generated.
+///
+/// Note that jailed processes will usually have a stacktrace of `<unknown>` because the backtrace
+/// routines attempt to open this binary and are unable to do so in a jail.
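+///
+/// Call this once, early in `main`, before spawning threads (a sketch; the actual
+/// call site lives outside this file):
+///
+/// ```ignore
+/// crate::sys::windows::panic_hook::set_panic_hook();
+/// ```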
+pub fn set_panic_hook() { + let default_panic = panic::take_hook(); + panic::set_hook(Box::new(move |info| { + // Ensure all in-flight metrics are fully flushed + metrics::get_destructor().cleanup(); + // TODO(b/144724919): should update log_panic_info for this "cleanly exit crosvm" bug + // log_panic_info(default_panic.as_ref(), info); + default_panic(info); + // Abort to trigger the crash reporter so that a minidump is generated. + abort(); + })); +} diff --git a/src/sys/windows/run_vcpu.rs b/src/sys/windows/run_vcpu.rs new file mode 100644 index 0000000000..711d46101e --- /dev/null +++ b/src/sys/windows/run_vcpu.rs @@ -0,0 +1,922 @@ +// Copyright 2022 The ChromiumOS Authors. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +use anyhow::{anyhow, Result}; +use arch::{self, LinuxArch, RunnableLinuxVm, VcpuAffinity}; +use base::{ + self, error, info, set_audio_thread_priorities, set_cpu_affinity, warn, Event, + Result as BaseResult, SafeMultimediaHandle, SendTube, Timer, Tube, VmEventType, +}; +use std::{ + arch::x86_64::{__cpuid, __cpuid_count}, + fmt::Display, +}; + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +use { + aarch64::AArch64 as Arch, + devices::{IrqChip, IrqChipAArch64 as IrqChipArch}, + hypervisor::{VcpuAArch64 as VcpuArch, VmAArch64 as VmArch}, +}; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use { + devices::IrqChipX86_64 as IrqChipArch, + hypervisor::{VcpuX86_64 as VcpuArch, VmX86_64 as VmArch}, + x86_64::{adjust_cpuid, CpuIdContext, X8664arch as Arch}, +}; + +use crate::bail_exit_code; +use crate::crosvm::sys::windows::exit::{Exit, ExitContext, ExitContextAnyhow}; +use crate::crosvm::sys::windows::stats::{StatisticsCollector, VmExitStatistics}; +use crate::sys::windows::save_vcpu_tsc_offset; +use cros_async::{select2, EventAsync, Executor, SelectResult, TimerAsync}; +use devices::{Bus, TscSyncMitigations, VcpuRunState}; +use futures::pin_mut; +#[cfg(feature = "whpx")] +use hypervisor::whpx::WhpxVcpu; +use hypervisor::{ + HypervisorCap, IoEventAddress, IoOperation, IoParams, VcpuExit, VcpuInitX86_64, VcpuRunHandle, +}; +use std::convert::TryInto; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Barrier}; +use std::thread::JoinHandle; +use std::time::{Duration, Instant}; +use std::{fmt, thread}; +use sync::{Condvar, Mutex}; +use tracing::trace_event; +use vm_control::VmRunMode; +use winapi::shared::winerror::ERROR_RETRY; + +use crate::sys::windows::ExitState; + +const ERROR_RETRY_I32: i32 = ERROR_RETRY as i32; + +#[derive(Default)] +pub struct VcpuRunMode { + mtx: Mutex, + cvar: Condvar, +} + +impl VcpuRunMode { + pub fn set_and_notify(&self, new_mode: VmRunMode) { + *self.mtx.lock() = new_mode; + self.cvar.notify_all(); + } +} + +struct RunnableVcpuInfo { + vcpu: V, + thread_priority_handle: Option, + vcpu_run_handle: VcpuRunHandle, +} + +#[derive(Clone, Debug)] +struct VcpuMonitoringMetadata { + pub start_instant: Instant, + // Milliseconds since the baseline start_instant + pub last_run_time: Arc, + pub last_exit_snapshot: Arc>>, +} + +#[derive(Clone, Debug)] +struct VcpuRunThread { + pub cpu_id: usize, + pub monitoring_metadata: Option, +} + +impl VcpuRunThread { + pub fn new(cpu_id: usize, enable_vcpu_monitoring: bool) -> VcpuRunThread { + VcpuRunThread { + cpu_id, + monitoring_metadata: enable_vcpu_monitoring.then(|| VcpuMonitoringMetadata { + start_instant: Instant::now(), + last_run_time: Arc::new(AtomicU64::new(0)), + last_exit_snapshot: 
Arc::new(Mutex::new(Option::None)), + }), + } + } + + /// Perform WHPX-specific vcpu configurations + #[cfg(feature = "whpx")] + fn whpx_configure_vcpu(vcpu: &mut dyn VcpuArch, irq_chip: &mut dyn IrqChipArch) { + // only apply to actual WhpxVcpu instances + if let Some(whpx_vcpu) = vcpu.downcast_mut::() { + // WhpxVcpu instances need to know the TSC and Lapic frequencies to handle Hyper-V MSR reads + // and writes. + let tsc_freq = devices::tsc_frequency() + .map_err(|e| { + error!( + "Could not determine TSC frequency, WHPX vcpu will not be configured with \ + a TSC Frequency: {e}" + ); + e + }) + .ok(); + whpx_vcpu.set_frequencies(tsc_freq, irq_chip.lapic_frequency()); + } + } + + // Sets up a vcpu and converts it into a runnable vcpu. + fn runnable_vcpu( + cpu_id: usize, + vcpu: Option, + vcpu_init: VcpuInitX86_64, + vm: &impl VmArch, + irq_chip: &mut dyn IrqChipArch, + vcpu_count: usize, + run_rt: bool, + vcpu_affinity: Option>, + no_smt: bool, + has_bios: bool, + host_cpu_topology: bool, + force_calibrated_tsc_leaf: bool, + ) -> Result> + where + V: VcpuArch, + { + let mut vcpu = match vcpu { + Some(v) => v, + None => { + // If vcpu is None, it means this arch/hypervisor requires create_vcpu to be called from + // the vcpu thread. + match vm + .create_vcpu(cpu_id) + .exit_context(Exit::CreateVcpu, "failed to create vcpu")? + .downcast::() + { + Ok(v) => *v, + Err(_) => panic!("VM created wrong type of VCPU"), + } + } + }; + + irq_chip + .add_vcpu(cpu_id, &vcpu) + .exit_context(Exit::AddIrqChipVcpu, "failed to add vcpu to irq chip")?; + + if let Some(affinity) = vcpu_affinity { + if let Err(e) = set_cpu_affinity(affinity) { + error!("Failed to set CPU affinity: {}", e); + } + } + + Arch::configure_vcpu( + vm, + vm.get_hypervisor(), + irq_chip, + &mut vcpu, + vcpu_init, + cpu_id, + vcpu_count, + has_bios, + no_smt, + host_cpu_topology, + /* enable_pnp_data */ false, + /* itmt */ false, + force_calibrated_tsc_leaf, + ) + .exit_context(Exit::ConfigureVcpu, "failed to configure vcpu")?; + + #[cfg(feature = "whpx")] + Self::whpx_configure_vcpu(&mut vcpu, irq_chip); + + let mut thread_priority_handle = None; + if run_rt { + // Until we are multi process on Windows, we can't use the normal thread priority APIs; + // instead, we use a trick from the audio device which is able to set a thread RT even + // though the process itself is not RT. 
+            thread_priority_handle = match set_audio_thread_priorities() {
+                Ok(hndl) => Some(hndl),
+                Err(e) => {
+                    warn!("Failed to set vcpu thread to real time priority: {}", e);
+                    None
+                }
+            };
+        }
+
+        let vcpu_run_handle = vcpu
+            .take_run_handle(None)
+            .exit_context(Exit::RunnableVcpu, "failed to set thread id for vcpu")?;
+
+        Ok(RunnableVcpuInfo {
+            vcpu,
+            thread_priority_handle,
+            vcpu_run_handle,
+        })
+    }
+
+    pub fn run<V>(
+        &self,
+        vcpu: Option<V>,
+        vcpu_init: VcpuInitX86_64,
+        vcpus: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
+        vm: impl VmArch + 'static,
+        mut irq_chip: Box<dyn IrqChipArch>,
+        vcpu_count: usize,
+        run_rt: bool,
+        vcpu_affinity: Option<Vec<usize>>,
+        delay_rt: bool,
+        no_smt: bool,
+        start_barrier: Arc<Barrier>,
+        vcpu_create_barrier: Arc<Barrier>,
+        has_bios: bool,
+        mut io_bus: devices::Bus,
+        mut mmio_bus: devices::Bus,
+        vm_evt_wrtube: SendTube,
+        requires_pvclock_ctrl: bool,
+        run_mode_arc: Arc<VcpuRunMode>,
+        stats: Option<Arc<Mutex<StatisticsCollector>>>,
+        host_cpu_topology: bool,
+        tsc_offset: Option<u64>,
+        force_calibrated_tsc_leaf: bool,
+    ) -> Result<JoinHandle<Result<()>>>
+    where
+        V: VcpuArch + 'static,
+    {
+        let context = self.clone();
+        thread::Builder::new()
+            .name(format!("crosvm_vcpu{}", self.cpu_id))
+            .spawn(move || {
+                // Having a closure returning ExitState guarantees that we
+                // send a VmEventType on all code paths after the closure
+                // returns.
+                let vcpu_fn = || -> Result<ExitState> {
+                    let runnable_vcpu = Self::runnable_vcpu(
+                        context.cpu_id,
+                        vcpu,
+                        vcpu_init,
+                        &vm,
+                        irq_chip.as_mut(),
+                        vcpu_count,
+                        run_rt && !delay_rt,
+                        vcpu_affinity,
+                        no_smt,
+                        has_bios,
+                        host_cpu_topology,
+                        force_calibrated_tsc_leaf,
+                    );
+
+                    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+                    let cpuid_context = CpuIdContext::new(
+                        context.cpu_id,
+                        vcpu_count,
+                        no_smt,
+                        host_cpu_topology,
+                        Some(irq_chip.as_ref()),
+                        /* enable_pnp_data */ false,
+                        /* itmt */ false,
+                        force_calibrated_tsc_leaf,
+                        vm.get_hypervisor()
+                            .check_capability(HypervisorCap::CalibratedTscLeafRequired),
+                        __cpuid_count,
+                        __cpuid,
+                    );
+
+                    // The vcpu_create_barrier is supplied from the main thread in order for it to
+                    // wait until this thread is done creating its vcpu.
+                    vcpu_create_barrier.wait();
+
+                    // Wait for this barrier before continuing forward.
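+                    // Startup sequencing, concretely: with N vcpus, the main thread
+                    // creates one fresh vcpu_create_barrier of size 2 per vcpu, so vcpu
+                    // creation is serialized one at a time, while the shared start_barrier
+                    // of size N + 1 then releases every vcpu thread at once.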
+ start_barrier.wait(); + + let RunnableVcpuInfo { + vcpu, + thread_priority_handle: _thread_priority_handle, + vcpu_run_handle, + } = runnable_vcpu?; + + if let Some(offset) = tsc_offset { + vcpu.set_tsc_offset(offset).unwrap_or_else(|e| { + error!( + "Failed to set tsc_offset of {} on vcpu {}: {}", + offset, context.cpu_id, e + ) + }); + } + + // Clone vcpu so it can be used by the main thread to force a vcpu run to exit + vcpus + .lock() + .push(Box::new(vcpu.try_clone().expect("Could not clone vcpu!"))); + + mmio_bus.set_access_id(context.cpu_id); + io_bus.set_access_id(context.cpu_id); + + vcpu_loop( + &context, + vcpu, + vm, + vcpu_run_handle, + irq_chip, + io_bus, + mmio_bus, + requires_pvclock_ctrl, + run_mode_arc, + stats, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + cpuid_context, + ) + }; + + let final_event_data = match vcpu_fn().unwrap_or_else(|e| { + error!("vcpu {} run loop exited with error: {}", context.cpu_id, e); + ExitState::Stop + }) { + ExitState::Stop => VmEventType::Exit, + _ => unreachable!(), + }; + vm_evt_wrtube + .send::(&final_event_data) + .unwrap_or_else(|e| { + error!( + "failed to send final event {:?} on vcpu {}: {}", + final_event_data, context.cpu_id, e + ) + }); + Ok(()) + }) + .exit_context(Exit::SpawnVcpu, "failed to spawn VCPU thread") + } +} + +#[derive(Clone, Debug)] +struct VcpuExitData { + // Represented by duration since baseline start_instant + exit_time: Duration, + exit_result: BaseResult, +} + +impl Display for VcpuExitData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "exit result: {:?}", self.exit_result) + } +} + +struct VcpuStallMonitor { + vcpu_run_threads: Vec, + run_mode: Arc, +} + +impl VcpuStallMonitor { + const HOST_STALL_TIMEOUT: Duration = Duration::from_secs(2); + const VCPU_CHECKUP_INTERVAL: Duration = Duration::from_secs(1); + const STALL_REPORTING_LIMITER: Duration = Duration::from_secs(10); + + pub fn init(run_mode: Arc) -> VcpuStallMonitor { + VcpuStallMonitor { + vcpu_run_threads: vec![], + run_mode, + } + } + + pub fn add_vcpu_thread(&mut self, thread: VcpuRunThread) { + self.vcpu_run_threads.push(thread); + } + + pub fn run(self, exit_event: &Event) -> Result>> { + let cloned_exit_event = exit_event + .try_clone() + .exit_context(Exit::CloneEvent, "failed to clone event")?; + thread::Builder::new() + .name("crosvm_vcpu_stall_monitor".to_string()) + .spawn(move || { + let ex = Executor::new()?; + + let mut timer = TimerAsync::new(Timer::new()?, &ex)?; + let mut reset_timer = true; + + let exit_evt_async = EventAsync::new(cloned_exit_event, &ex)?; + let exit_future = exit_evt_async.next_val(); + pin_mut!(exit_future); + 'main: loop { + if reset_timer { + timer.reset( + Self::VCPU_CHECKUP_INTERVAL, + Some(Self::VCPU_CHECKUP_INTERVAL), + )?; + reset_timer = false; + } + let timer_future = timer.next_val(); + pin_mut!(timer_future); + match ex.run_until(select2(timer_future, exit_future)) { + Ok((timer_result, exit_result)) => { + match exit_result { + SelectResult::Finished(_) => { + info!("vcpu monitor got exit event"); + break 'main; + } + SelectResult::Pending(future) => exit_future = future, + } + + match timer_result { + SelectResult::Finished(Err(e)) => { + error!( + "vcpu monitor aborting due to error awaiting future: {}", + e + ); + break 'main; + } + SelectResult::Finished(_) => self.report_any_stalls(), + _ => (), + } + } + Err(e) => { + error!("vcpu monitor failed to wait on future set: {:?}", e); + break 'main; + } + } + + // Always ensure the vcpus aren't suspended 
before continuing to monitor.
+                    let mut run_mode_lock = self.run_mode.mtx.lock();
+                    loop {
+                        match *run_mode_lock {
+                            VmRunMode::Running => break,
+                            VmRunMode::Suspending | VmRunMode::Breakpoint => {
+                                info!("vcpu monitor pausing until end of suspension");
+                                run_mode_lock = self.run_mode.cvar.wait(run_mode_lock);
+                                reset_timer = true;
+                            }
+                            VmRunMode::Exiting => {
+                                info!("vcpu monitor detected vm exit");
+                                break 'main;
+                            }
+                        }
+                    }
+                }
+
+                Ok(())
+            })
+            .exit_context(
+                Exit::SpawnVcpuMonitor,
+                "failed to spawn VCPU stall monitor thread",
+            )
+    }
+
+    fn report_any_stalls(&self) {
+        // TODO(b/208267651): Add and fire Clearcut events for stalls (and add tests)
+        // TODO(b/208267651): Also test guest stalls (vcpu.run() goes too long without exiting)
+        let now = Instant::now();
+        for vcpu_thread in self.vcpu_run_threads.iter() {
+            let monitoring_metadata = vcpu_thread.monitoring_metadata.as_ref().unwrap();
+            if let Some(ref exit_snapshot) = monitoring_metadata.last_exit_snapshot.lock().clone() {
+                let last_run =
+                    Duration::from_millis(monitoring_metadata.last_run_time.load(Ordering::SeqCst));
+                if last_run < exit_snapshot.exit_time {
+                    // VCPU is between runs.
+                    let time_since_exit = now.saturating_duration_since(
+                        monitoring_metadata.start_instant + exit_snapshot.exit_time,
+                    );
+                    if time_since_exit > Self::HOST_STALL_TIMEOUT {
+                        self.report_stall(vcpu_thread.cpu_id, exit_snapshot, time_since_exit);
+                    }
+                }
+            };
+        }
+    }
+
+    fn report_stall(&self, cpu_id: usize, exit_data: &VcpuExitData, stall_time: Duration) {
+        if stall_time > Self::STALL_REPORTING_LIMITER {
+            return;
+        }
+        // Double check the Vm is running. We don't care about stalls during suspension/exit.
+        if *self.run_mode.mtx.lock() == VmRunMode::Running {
+            let duration_string = format!("{:.1}sec", stall_time.as_secs_f32());
+            error!(
+                "Host stall for {} on VCPU {} exit while handling: {}",
+                duration_string, cpu_id, exit_data,
+            );
+        }
+    }
+}
+
+fn setup_vcpu_signal_handler() -> Result<()> {
+    Ok(())
+}
+
+pub fn run_all_vcpus<V: VmArch + 'static, Vcpu: VcpuArch + 'static>(
+    vcpus: Vec<Option<Vcpu>>,
+    vcpu_boxes: Arc<Mutex<Vec<Box<dyn VcpuArch>>>>,
+    guest_os: &RunnableLinuxVm<V, Vcpu>,
+    exit_evt: &Event,
+    vm_evt_wrtube: &SendTube,
+    pvclock_host_tube: &Option<Tube>,
+    stats: &Option<Arc<Mutex<StatisticsCollector>>>,
+    host_cpu_topology: bool,
+    run_mode_arc: Arc<VcpuRunMode>,
+    tsc_sync_mitigations: TscSyncMitigations,
+    force_calibrated_tsc_leaf: bool,
+) -> Result<Vec<JoinHandle<Result<()>>>> {
+    let mut vcpu_threads = Vec::with_capacity(guest_os.vcpu_count + 1);
+    let start_barrier = Arc::new(Barrier::new(guest_os.vcpu_count + 1));
+    let enable_vcpu_monitoring = anti_tamper::enable_vcpu_monitoring();
+    setup_vcpu_signal_handler()?;
+
+    let mut stall_monitor =
+        enable_vcpu_monitoring.then(|| VcpuStallMonitor::init(run_mode_arc.clone()));
+    for (cpu_id, vcpu) in vcpus.into_iter().enumerate() {
+        let vcpu_affinity = match guest_os.vcpu_affinity.clone() {
+            Some(VcpuAffinity::Global(v)) => Some(v),
+            Some(VcpuAffinity::PerVcpu(mut m)) => Some(m.remove(&cpu_id).unwrap_or_default()),
+            None => None,
+        };
+
+        // TSC sync mitigations may set vcpu affinity and set a TSC offset.
+        let (vcpu_affinity, tsc_offset): (Option<Vec<usize>>, Option<u64>) =
+            if let Some(mitigation_affinity) = tsc_sync_mitigations.get_vcpu_affinity(cpu_id) {
+                if vcpu_affinity.is_none() {
+                    (
+                        Some(mitigation_affinity),
+                        tsc_sync_mitigations.get_vcpu_tsc_offset(cpu_id),
+                    )
+                } else {
+                    error!(
+                        "Core affinity {:?} specified via commandline conflicts and overrides \
+                        affinity needed for TSC sync mitigation: {:?}.",
+                        vcpu_affinity, mitigation_affinity
+                    );
+                    (vcpu_affinity, None)
+                }
+            } else {
+                (vcpu_affinity, None)
+            };
+
+        let vcpu_init = &guest_os.vcpu_init[cpu_id];
+        // The vcpu_create_barrier allows the main thread to delay the spawning of additional
+        // vcpu threads until a single spawned vcpu thread has finished creating its vcpu.
+        // We currently use this to allow creation of 1 vcpu at a time for all hypervisors.
+        // There are issues with multiple hypervisors with this approach:
+        // - Windows 11 has a regression which causes a BSOD with creation of multiple vcpus
+        //   in parallel. See http://b/229635845 for more details.
+        // - GHAXM/HAXM cannot create vcpu0 in parallel with other vcpus.
+        let vcpu_create_barrier = Arc::new(Barrier::new(2));
+        let vcpu_run_thread = VcpuRunThread::new(cpu_id, enable_vcpu_monitoring);
+        let join_handle = vcpu_run_thread.run(
+            vcpu,
+            vcpu_init.clone(),
+            vcpu_boxes.clone(),
+            guest_os
+                .vm
+                .try_clone()
+                .exit_context(Exit::CloneEvent, "failed to clone vm")?,
+            guest_os
+                .irq_chip
+                .try_box_clone()
+                .exit_context(Exit::CloneEvent, "failed to clone event")?,
+            guest_os.vcpu_count,
+            guest_os.rt_cpus.contains(&cpu_id),
+            vcpu_affinity,
+            guest_os.delay_rt,
+            guest_os.no_smt,
+            start_barrier.clone(),
+            vcpu_create_barrier.clone(),
+            guest_os.has_bios,
+            (*guest_os.io_bus).clone(),
+            (*guest_os.mmio_bus).clone(),
+            vm_evt_wrtube
+                .try_clone()
+                .exit_context(Exit::CloneTube, "failed to clone tube")?,
+            pvclock_host_tube.is_none(),
+            run_mode_arc.clone(),
+            stats.clone(),
+            host_cpu_topology,
+            tsc_offset,
+            force_calibrated_tsc_leaf,
+        )?;
+        if let Some(ref mut monitor) = stall_monitor {
+            monitor.add_vcpu_thread(vcpu_run_thread);
+        }
+
+        // Wait until the vcpu is created before we start a new vcpu thread.
+        vcpu_create_barrier.wait();
+
+        vcpu_threads.push(join_handle);
+    }
+    if let Some(monitor) = stall_monitor {
+        vcpu_threads.push(monitor.run(exit_evt)?);
+    }
+    // Now wait on the start barrier to start all threads at the same time.
+    start_barrier.wait();
+    Ok(vcpu_threads)
+}
+
+fn vcpu_loop<V>(
+    context: &VcpuRunThread,
+    mut vcpu: V,
+    vm: impl VmArch + 'static,
+    vcpu_run_handle: VcpuRunHandle,
+    irq_chip: Box<dyn IrqChipArch>,
+    mut io_bus: Bus,
+    mut mmio_bus: Bus,
+    requires_pvclock_ctrl: bool,
+    run_mode_arc: Arc<VcpuRunMode>,
+    stats: Option<Arc<Mutex<StatisticsCollector>>>,
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] cpuid_context: CpuIdContext,
+) -> Result<ExitState>
+where
+    V: VcpuArch + 'static,
+{
+    let mut exit_stats = VmExitStatistics::new();
+    mmio_bus.stats.set_enabled(stats.is_some());
+    io_bus.stats.set_enabled(stats.is_some());
+    exit_stats.set_enabled(stats.is_some());
+
+    let mut save_tsc_offset = true;
+
+    loop {
+        let _trace_event = trace_event!(crosvm, "vcpu loop");
+        let mut check_vm_shutdown = false;
+
+        match irq_chip.wait_until_runnable(&vcpu).with_exit_context(
+            Exit::WaitUntilRunnable,
+            || {
+                format!(
+                    "error waiting for vcpu {} to become runnable",
+                    context.cpu_id
+                )
+            },
+        )?
{ + VcpuRunState::Runnable => {} + VcpuRunState::Interrupted => check_vm_shutdown = true, + } + + if !check_vm_shutdown { + let exit = { + let _trace_event = trace_event!(crosvm, "vcpu::run"); + if let Some(ref monitoring_metadata) = context.monitoring_metadata { + monitoring_metadata.last_run_time.store( + // Safe conversion because millis will always be < u32::MAX + monitoring_metadata + .start_instant + .elapsed() + .as_millis() + .try_into() + .unwrap(), + Ordering::SeqCst, + ); + } + vcpu.run(&vcpu_run_handle) + }; + if let Some(ref monitoring_metadata) = context.monitoring_metadata { + *monitoring_metadata.last_exit_snapshot.lock() = Some(VcpuExitData { + exit_time: monitoring_metadata.start_instant.elapsed(), + exit_result: exit, + }); + } + + // save the tsc offset if we need to + if save_tsc_offset { + if let Ok(offset) = vcpu.get_tsc_offset() { + save_vcpu_tsc_offset(offset, context.cpu_id); + } else { + error!("Unable to determine TSC offset"); + } + save_tsc_offset = false; + } + + let start = exit_stats.start_stat(); + + match exit { + Ok(VcpuExit::Io) => { + let _trace_event = trace_event!(crosvm, "VcpuExit::Io"); + vcpu.handle_io(&mut |IoParams { address, mut size, operation}| { + match operation { + IoOperation::Read => { + let mut data = [0u8; 8]; + if size > data.len() { + error!("unsupported IoIn size of {} bytes", size); + size = data.len(); + } + io_bus.read(address, &mut data[..size]); + Some(data) + } + IoOperation::Write { data } => { + if size > data.len() { + error!("unsupported IoOut size of {} bytes", size); + size = data.len() + } + vm.handle_io_events(IoEventAddress::Pio(address), &data[..size]) + .unwrap_or_else(|e| error!( + "failed to handle ioevent for pio write to {} on vcpu {}: {}", + address, context.cpu_id, e + )); + io_bus.write(address, &data[..size]); + None + } + } + }).unwrap_or_else(|e| error!("failed to handle io: {}", e)); + } + Ok(VcpuExit::Mmio) => { + let _trace_event = trace_event!(crosvm, "VcpuExit::Mmio"); + vcpu.handle_mmio(&mut |IoParams { address, mut size, operation }| { + match operation { + IoOperation::Read => { + let mut data = [0u8; 8]; + if size > data.len() { + error!("unsupported MmioRead size of {} bytes", size); + size = data.len(); + } + { + let data = &mut data[..size]; + if !mmio_bus.read(address, data) { + info!( + "mmio read failed: {:x}; trying memory read..", + address + ); + vm.get_memory() + .read_exact_at_addr( + data, + vm_memory::GuestAddress(address), + ) + .unwrap_or_else(|e| { + error!( + "guest memory read failed at {:x}: {}", + address, e + ) + }); + } + } + Some(data) + } + IoOperation::Write { data } => { + if size > data.len() { + error!("unsupported MmioWrite size of {} bytes", size); + size = data.len() + } + let data = &data[..size]; + vm.handle_io_events(IoEventAddress::Mmio(address), data) + .unwrap_or_else(|e| error!( + "failed to handle ioevent for mmio write to {} on vcpu {}: {}", + address, context.cpu_id, e + )); + if !mmio_bus.write(address, data) { + info!( + "mmio write failed: {:x}; trying memory write..", + address + ); + vm.get_memory() + .write_all_at_addr(data, vm_memory::GuestAddress(address)) + .unwrap_or_else(|e| error!( + "guest memory write failed at {:x}: {}", + address, e + )); + } + None + } + } + }).unwrap_or_else(|e| error!("failed to handle mmio: {}", e)); + } + Ok(VcpuExit::IoapicEoi { vector }) => { + irq_chip.broadcast_eoi(vector).unwrap_or_else(|e| { + error!( + "failed to broadcast eoi {} on vcpu {}: {}", + vector, context.cpu_id, e + ) + }); + } + 
Ok(VcpuExit::IrqWindowOpen) => {} + Ok(VcpuExit::Hlt) => irq_chip.halted(context.cpu_id), + + // VcpuExit::Shutdown is always an error on Windows. HAXM exits with + // Shutdown only for triple faults and other vcpu panics. WHPX never exits + // with Shutdown. Normal reboots and shutdowns, like window close, use + // the vm event tube and VmRunMode::Exiting instead of VcpuExit::Shutdown. + Ok(VcpuExit::Shutdown) => bail_exit_code!(Exit::VcpuShutdown, "vcpu shutdown"), + Ok(VcpuExit::FailEntry { + hardware_entry_failure_reason, + }) => bail_exit_code!( + Exit::VcpuFailEntry, + "vcpu hw run failure: {:#x}", + hardware_entry_failure_reason, + ), + Ok(VcpuExit::SystemEventShutdown) => { + bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventShutdown") + } + Ok(VcpuExit::SystemEventReset) => { + bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventReset") + } + Ok(VcpuExit::SystemEventCrash) => { + bail_exit_code!(Exit::VcpuSystemEvent, "vcpu SystemEventCrash") + } + + // When we're shutting down (e.g., emulator window gets closed), GVM vmexits + // with KVM_EXIT_INTR, which vcpu.run maps to VcpuExit::Intr. But KVM_EXIT_INTR + // can happen during normal operation too, when GVM's timer finds requests + // pending from the host. So we set check_vm_shutdown, then below check the + // VmRunMode state to see if we should exit the run loop. + Ok(VcpuExit::Intr) => check_vm_shutdown = true, + Ok(VcpuExit::Canceled) => check_vm_shutdown = true, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Ok(VcpuExit::Cpuid { mut entry }) => { + let _trace_event = trace_event!(crosvm, "VcpuExit::Cpuid"); + // adjust the results based on crosvm logic + adjust_cpuid(&mut entry, &cpuid_context); + + // let the vcpu finish handling the exit + vcpu.handle_cpuid(&entry).unwrap_or_else(|e| { + error!( + "failed to handle setting cpuid results on cpu {}: {}", + context.cpu_id, e + ) + }); + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Ok(VcpuExit::MsrAccess) => {} // MsrAccess handled by hypervisor impl + Ok(r) => { + error!("unexpected vcpu.run return value: {:?}", r); + check_vm_shutdown = true; + } + Err(e) => match e.errno() { + ERROR_RETRY_I32 => {} + _ => { + run_mode_arc.set_and_notify(VmRunMode::Exiting); + Err(e).exit_context(Exit::VcpuRunError, "vcpu run error")?; + } + }, + } + + exit_stats.end_stat(&exit, start); + } + + if check_vm_shutdown { + let mut run_mode_lock = run_mode_arc.mtx.lock(); + loop { + match *run_mode_lock { + VmRunMode::Running => break, + VmRunMode::Suspending => { + // On KVM implementations that use a paravirtualized clock (e.g. + // x86), a flag must be set to indicate to the guest kernel that + // a VCPU was suspended. The guest kernel will use this flag to + // prevent the soft lockup detection from triggering when this + // VCPU resumes, which could happen days later in realtime. + if requires_pvclock_ctrl { + vcpu.pvclock_ctrl().unwrap_or_else(|e| error!( + "failed to signal to hypervisor that vcpu {} is being suspended: {}", + context.cpu_id, e + )); + } + } + VmRunMode::Breakpoint => {} + VmRunMode::Exiting => { + if let Some(stats) = stats { + let mut collector = stats.lock(); + collector.pio_bus_stats.push(io_bus.stats); + collector.mmio_bus_stats.push(mmio_bus.stats); + collector.vm_exit_stats.push(exit_stats); + } + return Ok(ExitState::Stop); + } + } + // Give ownership of our exclusive lock to the condition variable that + // will block. When the condition variable is notified, `wait` will + // unblock and return a new exclusive lock. 
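+                // This is the standard condvar loop, equivalent in shape to (a sketch,
+                // with hypothetical names):
+                //
+                //     let mut guard = mode.mtx.lock();
+                //     while *guard != VmRunMode::Running {
+                //         guard = mode.cvar.wait(guard);
+                //     }
+                //
+                // except that the match above also reacts to Suspending, Breakpoint,
+                // and Exiting before waiting again.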
+ run_mode_lock = run_mode_arc.cvar.wait(run_mode_lock); + } + } + + irq_chip.inject_interrupts(&vcpu).unwrap_or_else(|e| { + error!( + "failed to inject interrupts for vcpu {}: {}", + context.cpu_id, e + ) + }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct SetupData { + pub monitor: VcpuStallMonitor, + pub exit_evt: Event, + } + + fn set_up_stall_monitor(vcpu_count: usize) -> Result { + let run_mode = Arc::new(VcpuRunMode::default()); + let mut monitor = VcpuStallMonitor::init(run_mode.clone()); + + for id in 0..vcpu_count { + let new_vcpu = VcpuRunThread::new(id, true /* enable_vcpu_monitoring */); + monitor.add_vcpu_thread(new_vcpu); + } + + Ok(SetupData { + monitor, + exit_evt: Event::new().expect("Failed to create event"), + }) + } + + #[test] + fn stall_monitor_closes_on_exit_evt() -> Result<()> { + let SetupData { monitor, exit_evt } = set_up_stall_monitor(1)?; + + let _ = exit_evt.write(1)?; + let _ = monitor + .run(&exit_evt)? + .join() + .unwrap_or_else(|e| panic!("Thread join failed: {:?}", e)); + Ok(()) + } +}