diff --git a/resources/guest_configs/ci.config b/resources/guest_configs/ci.config index 166ab94db3f..f5c9d3a64c0 100644 --- a/resources/guest_configs/ci.config +++ b/resources/guest_configs/ci.config @@ -5,6 +5,7 @@ CONFIG_SQUASHFS_ZSTD=y # aarch64 only TBD split into a separate file CONFIG_DEVMEM=y # CONFIG_ARM64_ERRATUM_3194386 is not set +CONFIG_DMA_RESTRICTED_POOL=y # Needed for CTRL+ALT+DEL support CONFIG_SERIO=y CONFIG_SERIO_I8042=y diff --git a/src/firecracker/src/api_server/request/machine_configuration.rs b/src/firecracker/src/api_server/request/machine_configuration.rs index 2e8addffb74..6dbfdf8d64a 100644 --- a/src/firecracker/src/api_server/request/machine_configuration.rs +++ b/src/firecracker/src/api_server/request/machine_configuration.rs @@ -119,6 +119,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + mem_config: Some(Default::default()), smt: Some(false), cpu_template: None, track_dirty_pages: Some(false), @@ -140,6 +141,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + mem_config: Some(Default::default()), smt: Some(false), cpu_template: Some(StaticCpuTemplate::None), track_dirty_pages: Some(false), @@ -161,6 +163,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + mem_config: Some(Default::default()), smt: Some(false), cpu_template: None, track_dirty_pages: Some(true), @@ -186,6 +189,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + mem_config: Some(Default::default()), smt: Some(false), cpu_template: Some(StaticCpuTemplate::T2), track_dirty_pages: Some(true), @@ -213,6 +217,7 @@ mod tests { let expected_config = MachineConfigUpdate { vcpu_count: Some(8), mem_size_mib: Some(1024), + mem_config: Some(Default::default()), smt: Some(true), cpu_template: None, track_dirty_pages: Some(true), diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 61200cb2148..711672fc110 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -10,7 +10,7 @@ use std::ffi::CString; use std::fmt::Debug; use vm_fdt::{Error as VmFdtError, FdtWriter, FdtWriterNode}; -use vm_memory::GuestMemoryError; +use vm_memory::{GuestMemoryError, GuestMemoryRegion}; use super::super::DeviceType; use super::cache_info::{CacheEntry, read_cache_config}; @@ -18,12 +18,15 @@ use super::gic::GICDevice; use crate::device_manager::mmio::MMIODeviceInfo; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; -use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; +use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, KvmRegion}; // This is a value for uniquely identifying the FDT node declaring the interrupt controller. const GIC_PHANDLE: u32 = 1; // This is a value for uniquely identifying the FDT node containing the clock definition. const CLOCK_PHANDLE: u32 = 2; +/// Unique identifier for the /reserved-memory node describing the memory region the guest +/// is allowed to use for swiotlb +const IO_MEM_PHANDLE: u32 = 3; // You may be wondering why this big value? // This phandle is used to uniquely identify the FDT nodes containing cache information. Each cpu // can have a variable number of caches, some of these caches may be shared with other cpus. @@ -56,8 +59,10 @@ pub enum FdtError { } /// Creates the flattened device tree for this aarch64 microVM. 
+#[allow(clippy::too_many_arguments)]
 pub fn create_fdt(
     guest_mem: &GuestMemoryMmap,
+    swiotlb_region: Option<&KvmRegion>,
     vcpu_mpidr: Vec<u64>,
     cmdline: CString,
     device_info: &HashMap<(DeviceType, String), MMIODeviceInfo>,
@@ -83,7 +88,7 @@ pub fn create_fdt(
     // containing description of the interrupt controller for this VM.
     fdt_writer.property_u32("interrupt-parent", GIC_PHANDLE)?;
     create_cpu_nodes(&mut fdt_writer, &vcpu_mpidr)?;
-    create_memory_node(&mut fdt_writer, guest_mem)?;
+    create_memory_node(&mut fdt_writer, guest_mem, swiotlb_region)?;
     create_chosen_node(&mut fdt_writer, cmdline, initrd)?;
     create_gic_node(&mut fdt_writer, gic_device)?;
     create_timer_node(&mut fdt_writer)?;
@@ -208,7 +213,11 @@ fn create_cpu_nodes(fdt: &mut FdtWriter, vcpu_mpidr: &[u64]) -> Result<(), FdtEr
     Ok(())
 }

-fn create_memory_node(fdt: &mut FdtWriter, guest_mem: &GuestMemoryMmap) -> Result<(), FdtError> {
+fn create_memory_node(
+    fdt: &mut FdtWriter,
+    guest_mem: &GuestMemoryMmap,
+    swiotlb_region: Option<&KvmRegion>,
+) -> Result<(), FdtError> {
     // See https://github.com/torvalds/linux/blob/master/Documentation/devicetree/booting-without-of.txt#L960
     // for an explanation of this.

@@ -220,9 +229,13 @@ fn create_memory_node(fdt: &mut FdtWriter, guest_mem: &GuestMemoryMmap) -> Resul
     // The reason we do this is that Linux does not allow remapping system memory. However, without
     // remap, kernel drivers cannot get virtual addresses to read data from device memory. Leaving
     // this memory region out allows Linux kernel modules to remap and thus read this region.
+    //
+    // The generic memory description must include the range that we later declare as a reserved
+    // carve-out.
     let mem_size = guest_mem.last_addr().raw_value()
         - super::layout::DRAM_MEM_START
         - super::layout::SYSTEM_MEM_SIZE
+        + swiotlb_region.map(|r| r.len()).unwrap_or(0)
         + 1;
     let mem_reg_prop = &[
         super::layout::DRAM_MEM_START + super::layout::SYSTEM_MEM_SIZE,
@@ -233,6 +246,25 @@ fn create_memory_node(fdt: &mut FdtWriter, guest_mem: &GuestMemoryMmap) -> Resul
     fdt.property_array_u64("reg", mem_reg_prop)?;
     fdt.end_node(mem)?;

+    if let Some(region) = swiotlb_region {
+        let rmem = fdt.begin_node("reserved-memory")?;
+        fdt.property_u32("#address-cells", ADDRESS_CELLS)?;
+        fdt.property_u32("#size-cells", SIZE_CELLS)?;
+        fdt.property_null("ranges")?;
+        let dma = fdt.begin_node("bouncy_boi")?;
+        fdt.property_string("compatible", "restricted-dma-pool")?;
+        fdt.property_phandle(IO_MEM_PHANDLE)?;
+        fdt.property_array_u64("reg", &[region.start_addr().0, region.len()])?;
+        fdt.end_node(dma)?;
+        fdt.end_node(rmem)?;
+
+        log::info!(
+            "Placed swiotlb region at [{}, {})",
+            region.start_addr().0,
+            region.start_addr().0 + region.len()
+        );
+    }
+
     Ok(())
 }

@@ -358,6 +390,9 @@ fn create_virtio_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result<
     fdt.property_string("compatible", "virtio,mmio")?;
     fdt.property_array_u64("reg", &[dev_info.addr, dev_info.len])?;
+    // Force all virtio devices to place their vrings and buffers into the dedicated I/O memory
+    // region that is backed by traditional memory (i.e. not secret-free).
+ fdt.property_u32("memory-region", IO_MEM_PHANDLE)?; fdt.property_array_u32( "interrupts", &[ @@ -499,6 +534,7 @@ mod tests { let gic = create_gic(&vm, 1, None).unwrap(); create_fdt( &mem, + None, vec![0], CString::new("console=tty0").unwrap(), &dev_info, @@ -519,6 +555,7 @@ mod tests { let gic = create_gic(&vm, 1, None).unwrap(); create_fdt( &mem, + None, vec![0], CString::new("console=tty0").unwrap(), &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), @@ -544,6 +581,7 @@ mod tests { let current_dtb_bytes = create_fdt( &mem, + None, vec![0], CString::new("console=tty0").unwrap(), &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), @@ -606,6 +644,7 @@ mod tests { let current_dtb_bytes = create_fdt( &mem, + None, vec![0], CString::new("console=tty0").unwrap(), &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index ead827c08c4..85bfcdf529e 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -89,6 +89,14 @@ pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usi )] } +/// How many bytes of physical guest memory are addressible before the final gap in +/// the address space on this architecture. +/// +/// There are no architectural gaps in the physical address space on aarch64, so this is 0 +pub fn bytes_before_last_gap() -> usize { + 0 +} + /// Configures the system for booting Linux. pub fn configure_system_for_boot( vmm: &mut Vmm, @@ -130,8 +138,16 @@ pub fn configure_system_for_boot( .as_cstring() .expect("Cannot create cstring from cmdline string"); + let swiotlb_region = match vmm.vm.swiotlb_regions().num_regions() { + 0 | 1 => vmm.vm.swiotlb_regions().iter().next(), + _ => panic!( + "Firecracker tried to configure more than one swiotlb region. This is a logic bug." + ), + }; + let fdt = fdt::create_fdt( vmm.vm.guest_memory(), + swiotlb_region, vcpu_mpidr, cmdline, vmm.mmio_device_manager.get_device_info(), diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index e54723f5b6d..1a3d3d94e17 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -70,6 +70,7 @@ impl ArchVm { pub fn save_state(&self, mpidrs: &[u64]) -> Result { Ok(VmState { memory: self.common.guest_memory.describe(), + io_memory: self.common.swiotlb_regions.describe(), gic: self .get_irqchip() .save_device(mpidrs) @@ -96,6 +97,8 @@ impl ArchVm { pub struct VmState { /// Guest memory state pub memory: GuestMemoryState, + /// io memory state + pub io_memory: GuestMemoryState, /// GIC state. 
pub gic: GicState, } diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 61d65fea1a5..5db5b2c02e0 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -20,7 +20,7 @@ pub use aarch64::vcpu::*; pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, + ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, bytes_before_last_gap, configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::CMDLINE_MAX_SIZE, layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, @@ -39,7 +39,7 @@ pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, + ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, bytes_before_last_gap, configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::APIC_ADDR, layout::CMDLINE_MAX_SIZE, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index ca350cbf9af..4de00bd91c5 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -140,6 +140,14 @@ pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usi } } +/// How many bytes of physical guest memory are addressible before the final gap in +/// the address space on this architecture. +/// +/// On x86_64, this is the number of bytes that fit before the MMIO gap. +pub fn bytes_before_last_gap() -> usize { + u64_to_usize(MMIO_MEM_START) +} + /// Returns the memory address where the kernel could be loaded. pub fn get_kernel_start() -> u64 { layout::HIMEM_START diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..b186344cbb2 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -187,6 +187,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), + io_memory: self.common.swiotlb_regions.describe(), pitstate, clock, pic_master, @@ -211,6 +212,8 @@ impl ArchVm { pub struct VmState { /// guest memory state pub memory: GuestMemoryState, + /// io memory state + pub io_memory: GuestMemoryState, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 27ba3e96fa9..fff9afdacf5 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -4,7 +4,7 @@ //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM. 
use std::fmt::Debug; -use std::io; +use std::io::{self}; #[cfg(feature = "gdb")] use std::sync::mpsc; use std::sync::{Arc, Mutex}; @@ -12,8 +12,8 @@ use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; -use userfaultfd::Uffd; use utils::time::TimestampUs; +use vm_memory::GuestMemoryRegion; #[cfg(target_arch = "aarch64")] use vm_superio::Rtc; use vm_superio::Serial; @@ -50,14 +50,17 @@ use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; use crate::gdb; use crate::initrd::{InitrdConfig, InitrdError}; use crate::logger::{debug, error}; -use crate::persist::{MicrovmState, MicrovmStateError}; +use crate::persist::{ + MicrovmState, MicrovmStateError, RestoreMemoryError, SnapshotStateFromFileError, + restore_memory, send_uffd_handshake, +}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; +use crate::vmm_config::snapshot::{MemBackendConfig, MemBackendType}; use crate::vstate::kvm::Kvm; -use crate::vstate::memory::GuestRegionMmap; use crate::vstate::vcpu::{Vcpu, VcpuError}; use crate::vstate::vm::Vm; use crate::{EventManager, Vmm, VmmError, device_manager}; @@ -215,6 +218,10 @@ pub fn build_microvm_for_boot( .allocate_guest_memory() .map_err(StartMicrovmError::GuestMemory)?; + let swiotlb = vm_resources + .allocate_swiotlb_region() + .map_err(StartMicrovmError::GuestMemory)?; + // Clone the command-line so that a failed boot doesn't pollute the original. #[allow(unused_mut)] let mut boot_cmdline = boot_config.cmdline.clone(); @@ -235,6 +242,12 @@ pub fn build_microvm_for_boot( .register_memory_regions(guest_memory) .map_err(VmmError::Vm)?; + if let Some(swiotlb) = swiotlb { + vmm.vm + .register_swiotlb_region(swiotlb) + .map_err(VmmError::Vm)?; + } + let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; @@ -381,6 +394,8 @@ pub enum BuildMicrovmFromSnapshotError { SetTsc(#[from] crate::arch::SetTscError), /// Failed to restore microVM state: {0} RestoreState(#[from] crate::vstate::vm::ArchVmError), + /// Failed to get snapshot state from file: {0} + LoadState(#[from] SnapshotStateFromFileError), /// Failed to update microVM configuration: {0} VmUpdateConfig(#[from] MachineConfigError), /// Failed to restore MMIO device: {0} @@ -401,19 +416,19 @@ pub enum BuildMicrovmFromSnapshotError { ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError), /// VMGenID update failed: {0} VMGenIDUpdate(std::io::Error), + /// Failed to restore guest memory: {0} + Memory(#[from] RestoreMemoryError), } /// Builds and starts a microVM based on the provided MicrovmState. /// /// An `Arc` reference of the built `Vmm` is also plugged in the `EventManager`, while another /// is returned. 
-#[allow(clippy::too_many_arguments)] pub fn build_microvm_from_snapshot( instance_info: &InstanceInfo, event_manager: &mut EventManager, microvm_state: MicrovmState, - guest_memory: Vec, - uffd: Option, + mem_backend: &MemBackendConfig, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, ) -> Result>, BuildMicrovmFromSnapshotError> { @@ -427,11 +442,63 @@ pub fn build_microvm_from_snapshot( ) .map_err(StartMicrovmError::Internal)?; + let track_dirty_pages = vm_resources.machine_config.track_dirty_pages; + let huge_pages = vm_resources.machine_config.huge_pages; + + let mem_backend_path = &mem_backend.backend_path; + + let mem_file = match mem_backend.backend_type { + MemBackendType::File => Some(mem_backend_path), + MemBackendType::Uffd => None, + }; + + let guest_memory = restore_memory( + µvm_state.vm_state.memory, + mem_file, + huge_pages, + track_dirty_pages, + 0, + )?; + let io_memory = restore_memory( + µvm_state.vm_state.io_memory, + mem_file, + huge_pages, + track_dirty_pages, + guest_memory.iter().map(|r| r.len()).sum(), + )?; + vmm.vm .register_memory_regions(guest_memory) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; - vmm.uffd = uffd; + + for region in io_memory { + vmm.vm + .register_swiotlb_region(region) + .map_err(VmmError::Vm) + .map_err(StartMicrovmError::Internal)?; + } + + vmm.uffd = match mem_backend.backend_type { + MemBackendType::File => None, + MemBackendType::Uffd => { + let (uffd, mut mappings) = vmm + .vm + .create_uffd() + .map_err(RestoreMemoryError::UffdCreate)?; + + #[allow(deprecated)] + mappings.iter_mut().for_each(|mapping| { + mapping.page_size = vm_resources.machine_config.huge_pages.page_size(); + mapping.page_size_kib = vm_resources.machine_config.huge_pages.page_size(); + }); + + send_uffd_handshake(mem_backend_path, &mappings, &uffd) + .map_err(RestoreMemoryError::UffdHandshake)?; + + Some(uffd) + } + }; #[cfg(target_arch = "x86_64")] { @@ -471,7 +538,7 @@ pub fn build_microvm_from_snapshot( // Restore devices states. let mmio_ctor_args = MMIODevManagerConstructorArgs { - mem: vmm.vm.guest_memory(), + mem: vmm.vm.io_memory(), vm: vmm.vm.fd(), event_manager, resource_allocator: &mut vmm.resource_allocator, @@ -595,8 +662,15 @@ fn attach_virtio_device( ) -> Result<(), MmioError> { event_manager.add_subscriber(device.clone()); + // We have to enable swiotlb as part of the boot process, because the device objects + // themselves are created when the corresponding PUT API calls are made, and at that + // point we don't know yet whether swiotlb should be enabled or not. + if vmm.vm.has_swiotlb() { + device.lock().unwrap().force_swiotlb(); + } + // The device mutex mustn't be locked here otherwise it will deadlock. 
- let device = MmioTransport::new(vmm.vm.guest_memory().clone(), device, is_vhost_user); + let device = MmioTransport::new(vmm.vm.io_memory().clone(), device, is_vhost_user); vmm.mmio_device_manager .register_mmio_virtio_for_boot( vmm.vm.fd(), diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 99bde6e2e78..8b2f5b5c387 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -600,6 +600,10 @@ mod tests { fn set_acked_features(&mut self, _: u64) {} + fn force_swiotlb(&mut self) { + unimplemented!() + } + fn device_type(&self) -> u32 { 0 } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 186f09275bc..195f78fb6c6 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -25,7 +25,9 @@ use super::{ }; use crate::devices::virtio::balloon::BalloonError; use crate::devices::virtio::device::{IrqTrigger, IrqType}; -use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; +use crate::devices::virtio::generated::virtio_config::{ + VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, +}; use crate::logger::IncMetric; use crate::utils::u64_to_usize; use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; @@ -557,6 +559,10 @@ impl VirtioDevice for Balloon { self.acked_features = acked_features; } + fn force_swiotlb(&mut self) { + self.avail_features |= 1 << VIRTIO_F_ACCESS_PLATFORM; + } + fn device_type(&self) -> u32 { TYPE_BALLOON } diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index bf3043bcdd4..f777d876001 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -148,6 +148,13 @@ impl VirtioDevice for Block { } } + fn force_swiotlb(&mut self) { + match self { + Block::Virtio(b) => b.force_swiotlb(), + Block::VhostUser(b) => b.force_swiotlb(), + } + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index b0bf5a31e3f..864d38d90ac 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -16,7 +16,9 @@ use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; -use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; +use crate::devices::virtio::generated::virtio_config::{ + VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, +}; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::vhost_user::{VhostUserHandleBackend, VhostUserHandleImpl}; @@ -294,6 +296,10 @@ impl VirtioDevice for VhostUserBlock self.acked_features = acked_features; } + fn force_swiotlb(&mut self) { + self.avail_features |= 1 << VIRTIO_F_ACCESS_PLATFORM; + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index e2b134a9f25..d867892bccc 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -27,7 +27,9 @@ use 
crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDev use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; -use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; +use crate::devices::virtio::generated::virtio_config::{ + VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, +}; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; @@ -578,6 +580,10 @@ impl VirtioDevice for VirtioBlock { self.acked_features = acked_features; } + fn force_swiotlb(&mut self) { + self.avail_features |= 1 << VIRTIO_F_ACCESS_PLATFORM; + } + fn device_type(&self) -> u32 { TYPE_BLOCK } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 79fe7c0c77d..96e099e7202 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -188,6 +188,7 @@ pub mod tests { #![allow(clippy::undocumented_unsafe_blocks)] use std::os::unix::ffi::OsStrExt; + use vm_memory::GuestMemoryRegion; use vmm_sys_util::tempfile::TempFile; use super::*; @@ -195,7 +196,7 @@ pub mod tests { use crate::utils::u64_to_usize; use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory; - use crate::vstate::memory::{Bitmap, Bytes, GuestMemory}; + use crate::vstate::memory::{Bitmap, Bytes, GuestMemory, KvmRegion}; const FILE_LEN: u32 = 1024; // 2 pages of memory should be enough to test read/write ops and also dirty tracking. @@ -237,7 +238,10 @@ pub mod tests { true, HugePageConfig::None, ) - .unwrap(), + .unwrap() + .into_iter() + .map(|region| KvmRegion::from_mmap_region(region, 0)) + .collect(), ) .unwrap() } diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 62131e775f5..f7dbc9d73ef 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -102,6 +102,9 @@ pub trait VirtioDevice: AsAny + Send { /// - self.avail_features() & self.acked_features() = self.get_acked_features() fn set_acked_features(&mut self, acked_features: u64); + /// Make the virtio device offer the VIRTIO_F_ACCESS_PLATFORM feature + fn force_swiotlb(&mut self); + /// Check if virtio device has negotiated given feature. 
fn has_feature(&self, feature: u64) -> bool { (self.acked_features() & (1 << feature)) != 0 @@ -259,6 +262,10 @@ pub(crate) mod tests { todo!() } + fn force_swiotlb(&mut self) { + unimplemented!() + } + fn device_type(&self) -> u32 { todo!() } diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/mmio.rs index 12ee54bfb0a..3f75d6d35fd 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/mmio.rs @@ -423,6 +423,10 @@ pub(crate) mod tests { self.acked_features = acked_features; } + fn force_swiotlb(&mut self) { + unimplemented!() + } + fn device_type(&self) -> u32 { 123 } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index fff04d1da1a..f127fe9a0e4 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -16,7 +16,9 @@ use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; -use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; +use crate::devices::virtio::generated::virtio_config::{ + VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, +}; use crate::devices::virtio::generated::virtio_net::{ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, @@ -946,6 +948,10 @@ impl VirtioDevice for Net { self.acked_features = acked_features; } + fn force_swiotlb(&mut self) { + self.avail_features |= 1 << VIRTIO_F_ACCESS_PLATFORM; + } + fn device_type(&self) -> u32 { TYPE_NET } diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index efe42bfc3dc..1aeadc01766 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -308,6 +308,15 @@ impl Queue { Ok(slice.ptr_guard_mut().as_ptr()) } + fn log_queue_component_address(component: &'static str, addr: GuestAddress, size: usize) { + log::info!( + "Placed virt queue {} at [{}, {})", + component, + addr.0, + addr.0 + size as u64 + ); + } + /// Set up pointers to the queue objects in the guest memory /// and mark memory dirty for those objects pub fn initialize(&mut self, mem: &M) -> Result<(), QueueError> { @@ -353,6 +362,22 @@ impl Queue { )); } + Self::log_queue_component_address( + "descriptor table", + self.desc_table_address, + self.desc_table_size(), + ); + Self::log_queue_component_address( + "avail ring", + self.avail_ring_address, + self.avail_ring_size(), + ); + Self::log_queue_component_address( + "used ring", + self.used_ring_address, + self.used_ring_size(), + ); + Ok(()) } diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 97ac8676e0a..719365e8ce8 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -13,7 +13,9 @@ use super::metrics::METRICS; use super::{RNG_NUM_QUEUES, RNG_QUEUE}; use crate::devices::DeviceError; use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; -use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; +use crate::devices::virtio::generated::virtio_config::{ + VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_VERSION_1, +}; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; use crate::devices::virtio::queue::{FIRECRACKER_MAX_QUEUE_SIZE, Queue}; @@ -303,6 +305,10 @@ impl VirtioDevice 
for Entropy { self.device_state = DeviceState::Activated(mem); Ok(()) } + + fn force_swiotlb(&mut self) { + self.avail_features |= 1 << VIRTIO_F_ACCESS_PLATFORM; + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 83174fbc4d3..9b21ec7a506 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -380,7 +380,7 @@ impl VhostUserHandleImpl { let vhost_user_net_reg = VhostUserMemoryRegionInfo { guest_phys_addr: region.start_addr().raw_value(), memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, + userspace_addr: region.inner().userspace_addr, mmap_offset, mmap_handle, }; @@ -469,7 +469,7 @@ pub(crate) mod tests { use super::*; use crate::test_utils::create_tmp_socket; use crate::vstate::memory; - use crate::vstate::memory::GuestAddress; + use crate::vstate::memory::{GuestAddress, KvmRegion}; pub(crate) fn create_mem(file: File, regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { GuestMemoryMmap::from_regions( @@ -478,8 +478,12 @@ pub(crate) mod tests { libc::MAP_PRIVATE, Some(file), false, + 0, ) - .unwrap(), + .unwrap() + .into_iter() + .map(|region| KvmRegion::from_mmap_region(region, 0)) + .collect(), ) .unwrap() } @@ -789,7 +793,7 @@ pub(crate) mod tests { .map(|region| VhostUserMemoryRegionInfo { guest_phys_addr: region.start_addr().raw_value(), memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, + userspace_addr: region.inner().userspace_addr, mmap_offset: region.file_offset().unwrap().start(), mmap_handle: region.file_offset().unwrap().file().as_raw_fd(), }) diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index aa114f6cccb..e3b1bdae5cd 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -31,7 +31,9 @@ use super::packet::{VSOCK_PKT_HDR_SIZE, VsockPacketRx, VsockPacketTx}; use super::{VsockBackend, defs}; use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; -use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; +use crate::devices::virtio::generated::virtio_config::{ + VIRTIO_F_ACCESS_PLATFORM, VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1, +}; use crate::devices::virtio::queue::Queue as VirtQueue; use crate::devices::virtio::vsock::VsockError; use crate::devices::virtio::vsock::metrics::METRICS; @@ -280,6 +282,10 @@ where self.acked_features = acked_features } + fn force_swiotlb(&mut self) { + self.avail_features |= 1 << VIRTIO_F_ACCESS_PLATFORM; + } + fn device_type(&self) -> u32 { uapi::VIRTIO_ID_VSOCK } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 29f3b0148ac..03995c844b3 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -114,6 +114,8 @@ pub mod vstate; /// Module with initrd. 
pub mod initrd; +pub mod vm_memory_vendored; + use std::collections::HashMap; use std::io; use std::os::unix::io::AsRawFd; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index be2fbc0a040..a5d64cd8b32 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -9,12 +9,11 @@ use std::io::{self, Write}; use std::mem::forget; use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::{Arc, Mutex}; use semver::Version; use serde::{Deserialize, Serialize}; -use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; use vmm_sys_util::sock_ctrl_msg::ScmSocket; #[cfg(target_arch = "aarch64")] @@ -33,8 +32,10 @@ use crate::snapshot::Snapshot; use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; -use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigError, MachineConfigUpdate}; -use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, MemBackendType}; +use crate::vmm_config::machine_config::{ + HugePageConfig, MachineConfigError, MachineConfigUpdate, MemoryConfig, +}; +use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams}; use crate::vstate::kvm::KvmState; use crate::vstate::memory; use crate::vstate::memory::{GuestMemoryState, GuestRegionMmap, MemoryError}; @@ -47,6 +48,8 @@ use crate::{EventManager, Vmm, vstate}; pub struct VmInfo { /// Guest memory size. pub mem_size_mib: u64, + /// Memory config + pub mem_config: MemoryConfig, /// smt information pub smt: bool, /// CPU template type @@ -61,6 +64,7 @@ impl From<&VmResources> for VmInfo { fn from(value: &VmResources) -> Self { Self { mem_size_mib: value.machine_config.mem_size_mib as u64, + mem_config: value.machine_config.mem_config, smt: value.machine_config.smt, cpu_template: StaticCpuTemplate::from(&value.machine_config.cpu_template), boot_source: value.boot_source.config.clone(), @@ -94,7 +98,7 @@ pub struct MicrovmState { /// E.g. Guest memory contents for a region of `size` bytes can be found in the /// backend at `offset` bytes from the beginning, and should be copied/populated /// into `base_host_address`. -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Clone, Debug, Default, Serialize, Deserialize, PartialEq, Eq)] pub struct GuestRegionUffdMapping { /// Base host virtual address where the guest memory contents for this /// region should be copied/populated. @@ -175,7 +179,7 @@ pub fn create_snapshot( .for_each_virtio_device(|_, _, _, dev| { let d = dev.lock().unwrap(); if d.is_activated() { - d.mark_queue_memory_dirty(vmm.vm.guest_memory()) + d.mark_queue_memory_dirty(vmm.vm.io_memory()) } else { Ok(()) } @@ -276,22 +280,14 @@ pub fn validate_cpu_manufacturer_id(microvm_state: &MicrovmState) { } } } -/// Error type for [`snapshot_state_sanity_check`]. -#[derive(Debug, thiserror::Error, displaydoc::Display, PartialEq, Eq)] -pub enum SnapShotStateSanityCheckError { - /// No memory region defined. - NoMemory, -} /// Performs sanity checks against the state file and returns specific errors. -pub fn snapshot_state_sanity_check( - microvm_state: &MicrovmState, -) -> Result<(), SnapShotStateSanityCheckError> { +pub fn snapshot_state_sanity_check(microvm_state: &MicrovmState) -> Result<(), RestoreMemoryError> { // Check if the snapshot contains at least 1 mem region. 
    // Upper bound check will be done when creating guest memory by comparing against
    // KVM max supported value kvm_context.max_memslots().
    if microvm_state.vm_state.memory.regions.is_empty() {
-        return Err(SnapShotStateSanityCheckError::NoMemory);
+        return Err(RestoreMemoryError::NoMemory);
     }

     #[cfg(target_arch = "x86_64")]
@@ -302,26 +298,44 @@ pub fn snapshot_state_sanity_check(
     Ok(())
 }

-/// Error type for [`restore_from_snapshot`].
+/// Errors that can happen during restoration of snapshot memory.
 #[derive(Debug, thiserror::Error, displaydoc::Display)]
-pub enum RestoreFromSnapshotError {
-    /// Failed to get snapshot state from file: {0}
-    File(#[from] SnapshotStateFromFileError),
-    /// Invalid snapshot state: {0}
-    Invalid(#[from] SnapShotStateSanityCheckError),
-    /// Failed to load guest memory: {0}
-    GuestMemory(#[from] RestoreFromSnapshotGuestMemoryError),
-    /// Failed to build microVM from snapshot: {0}
-    Build(#[from] BuildMicrovmFromSnapshotError),
+pub enum RestoreMemoryError {
+    /// Failed to open memory backend file: {0}
+    Open(#[from] std::io::Error),
+    /// Attempt to restore a hugetlbfs snapshot by mmap-ing the memory file. Please use a UFFD-based memory backend instead
+    MapHugetlbfs,
+    /// Failure allocating/mapping memory during snapshot restore: {0}
+    Memory(#[from] MemoryError),
+    /// Failure to create UFFD: {0}
+    UffdCreate(#[from] userfaultfd::Error),
+    /// Failure to send UFFD handshake to handler process: {0}
+    UffdHandshake(vmm_sys_util::errno::Error),
+    /// No memory region defined.
+    NoMemory,
 }

-/// Sub-Error type for [`restore_from_snapshot`] to contain either [`GuestMemoryFromFileError`] or
-/// [`GuestMemoryFromUffdError`] within [`RestoreFromSnapshotError`].
-#[derive(Debug, thiserror::Error, displaydoc::Display)]
-pub enum RestoreFromSnapshotGuestMemoryError {
-    /// Error creating guest memory from file: {0}
-    File(#[from] GuestMemoryFromFileError),
-    /// Error creating guest memory from uffd: {0}
-    Uffd(#[from] GuestMemoryFromUffdError),
+
+/// Restore the guest memory regions from the given state.
+pub fn restore_memory(
+    state: &GuestMemoryState,
+    path: Option<&PathBuf>,
+    huge_pages: HugePageConfig,
+    track_dirty: bool,
+    offset: u64,
+) -> Result<Vec<GuestRegionMmap>, RestoreMemoryError> {
+    let mem_regions = match path {
+        Some(path) => {
+            if huge_pages.is_hugetlbfs() {
+                return Err(RestoreMemoryError::MapHugetlbfs);
+            }
+
+            let mem_file = File::open(path)?;
+            memory::snapshot_file(mem_file, state.regions(), track_dirty, offset)?
+        }
+        None => memory::anonymous(state.regions(), track_dirty, huge_pages)?,
+    };
+
+    Ok(mem_regions)
 }

 /// Loads a Microvm snapshot producing a 'paused' Microvm.
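// Illustrative sketch (not part of the patch): how the `offset` argument to `restore_memory`
// above composes when both the regular and the io/swiotlb regions come from one snapshot
// file. The regular regions occupy the start of the file and the io regions follow directly
// after them, which is why build_microvm_from_snapshot() passes the summed length of the
// regular regions as `offset` for the second call. The helper below only models that
// bookkeeping with plain integers; its name is hypothetical, not part of the codebase.
fn file_offsets(region_sizes: &[u64], base_offset: u64) -> Vec<u64> {
    // Each region starts where the previous one ended, beginning at `base_offset`.
    region_sizes
        .iter()
        .scan(base_offset, |next, len| {
            let start = *next;
            *next += len;
            Some(start)
        })
        .collect()
}
// For example, with two 128 MiB regular regions and one 64 MiB swiotlb region:
//   file_offsets(&[128 << 20, 128 << 20], 0)   -> [0, 128 MiB]
//   file_offsets(&[64 << 20], 2 * (128 << 20)) -> [256 MiB]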
@@ -331,7 +345,7 @@ pub fn restore_from_snapshot( seccomp_filters: &BpfThreadMap, params: &LoadSnapshotParams, vm_resources: &mut VmResources, -) -> Result>, RestoreFromSnapshotError> { +) -> Result>, BuildMicrovmFromSnapshotError> { let mut microvm_state = snapshot_state_from_file(¶ms.snapshot_path)?; for entry in ¶ms.network_overrides { let net_devices = &mut microvm_state.device_states.net_devices; @@ -347,7 +361,6 @@ pub fn restore_from_snapshot( return Err(SnapshotStateFromFileError::UnknownNetworkDevice.into()); } } - let track_dirty_pages = params.enable_diff_snapshots; let vcpu_count = microvm_state .vcpu_states @@ -360,9 +373,10 @@ pub fn restore_from_snapshot( .update_machine_config(&MachineConfigUpdate { vcpu_count: Some(vcpu_count), mem_size_mib: Some(u64_to_usize(microvm_state.vm_info.mem_size_mib)), + mem_config: Some(microvm_state.vm_info.mem_config), smt: Some(microvm_state.vm_info.smt), cpu_template: Some(microvm_state.vm_info.cpu_template), - track_dirty_pages: Some(track_dirty_pages), + track_dirty_pages: Some(params.enable_diff_snapshots), huge_pages: Some(microvm_state.vm_info.huge_pages), #[cfg(feature = "gdb")] gdb_socket_path: None, @@ -372,41 +386,14 @@ pub fn restore_from_snapshot( // Some sanity checks before building the microvm. snapshot_state_sanity_check(µvm_state)?; - let mem_backend_path = ¶ms.mem_backend.backend_path; - let mem_state = µvm_state.vm_state.memory; - - let (guest_memory, uffd) = match params.mem_backend.backend_type { - MemBackendType::File => { - if vm_resources.machine_config.huge_pages.is_hugetlbfs() { - return Err(RestoreFromSnapshotGuestMemoryError::File( - GuestMemoryFromFileError::HugetlbfsSnapshot, - ) - .into()); - } - ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) - .map_err(RestoreFromSnapshotGuestMemoryError::File)?, - None, - ) - } - MemBackendType::Uffd => guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - vm_resources.machine_config.huge_pages, - ) - .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?, - }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, - guest_memory, - uffd, + ¶ms.mem_backend, seccomp_filters, vm_resources, ) - .map_err(RestoreFromSnapshotError::Build) } /// Error type for [`snapshot_state_from_file`] @@ -447,93 +434,11 @@ pub enum GuestMemoryFromFileError { HugetlbfsSnapshot, } -fn guest_memory_from_file( - mem_file_path: &Path, - mem_state: &GuestMemoryState, - track_dirty_pages: bool, -) -> Result, GuestMemoryFromFileError> { - let mem_file = File::open(mem_file_path)?; - let guest_mem = memory::snapshot_file(mem_file, mem_state.regions(), track_dirty_pages)?; - Ok(guest_mem) -} - -/// Error type for [`guest_memory_from_uffd`] -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum GuestMemoryFromUffdError { - /// Failed to restore guest memory: {0} - Restore(#[from] MemoryError), - /// Failed to UFFD object: {0} - Create(userfaultfd::Error), - /// Failed to register memory address range with the userfaultfd object: {0} - Register(userfaultfd::Error), - /// Failed to connect to UDS Unix stream: {0} - Connect(#[from] std::io::Error), - /// Failed to sends file descriptor: {0} - Send(#[from] vmm_sys_util::errno::Error), -} - -fn guest_memory_from_uffd( - mem_uds_path: &Path, - mem_state: &GuestMemoryState, - track_dirty_pages: bool, - huge_pages: HugePageConfig, -) -> Result<(Vec, Option), GuestMemoryFromUffdError> { - let (guest_memory, backend_mappings) = - create_guest_memory(mem_state, 
track_dirty_pages, huge_pages)?; - - let mut uffd_builder = UffdBuilder::new(); - - // We only make use of this if balloon devices are present, but we can enable it unconditionally - // because the only place the kernel checks this is in a hook from madvise, e.g. it doesn't - // actively change the behavior of UFFD, only passively. Without balloon devices - // we never call madvise anyway, so no need to put this into a conditional. - uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); - - let uffd = uffd_builder - .close_on_exec(true) - .non_blocking(true) - .user_mode_only(false) - .create() - .map_err(GuestMemoryFromUffdError::Create)?; - - for mem_region in guest_memory.iter() { - uffd.register(mem_region.as_ptr().cast(), mem_region.size() as _) - .map_err(GuestMemoryFromUffdError::Register)?; - } - - send_uffd_handshake(mem_uds_path, &backend_mappings, &uffd)?; - - Ok((guest_memory, Some(uffd))) -} - -fn create_guest_memory( - mem_state: &GuestMemoryState, - track_dirty_pages: bool, - huge_pages: HugePageConfig, -) -> Result<(Vec, Vec), GuestMemoryFromUffdError> { - let guest_memory = memory::anonymous(mem_state.regions(), track_dirty_pages, huge_pages)?; - let mut backend_mappings = Vec::with_capacity(guest_memory.len()); - let mut offset = 0; - for mem_region in guest_memory.iter() { - #[allow(deprecated)] - backend_mappings.push(GuestRegionUffdMapping { - base_host_virt_addr: mem_region.as_ptr() as u64, - size: mem_region.size(), - offset, - page_size: huge_pages.page_size(), - page_size_kib: huge_pages.page_size(), - }); - offset += mem_region.size() as u64; - } - - Ok((guest_memory, backend_mappings)) -} - -fn send_uffd_handshake( +pub(crate) fn send_uffd_handshake( mem_uds_path: &Path, backend_mappings: &[GuestRegionUffdMapping], uffd: &impl AsRawFd, -) -> Result<(), GuestMemoryFromUffdError> { +) -> Result<(), vmm_sys_util::errno::Error> { // This is safe to unwrap() because we control the contents of the vector // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(backend_mappings).unwrap(); @@ -603,7 +508,6 @@ mod tests { use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; - use crate::vstate::memory::GuestMemoryRegionState; fn default_vmm_with_devices() -> Vmm { let mut event_manager = EventManager::new().expect("Cannot create EventManager"); @@ -700,24 +604,6 @@ mod tests { ) } - #[test] - fn test_create_guest_memory() { - let mem_state = GuestMemoryState { - regions: vec![GuestMemoryRegionState { - base_address: 0, - size: 0x20000, - }], - }; - - let (_, uffd_regions) = - create_guest_memory(&mem_state, false, HugePageConfig::None).unwrap(); - - assert_eq!(uffd_regions.len(), 1); - assert_eq!(uffd_regions[0].size, 0x20000); - assert_eq!(uffd_regions[0].offset, 0); - assert_eq!(uffd_regions[0].page_size, HugePageConfig::None.page_size()); - } - #[test] fn test_send_uffd_handshake() { #[allow(deprecated)] diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 097e2041b55..5db9dc9a8dc 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -437,29 +437,39 @@ impl VmResources { Ok(()) } - /// Allocates guest memory in a configuration most appropriate for these [`VmResources`]. - /// - /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise - /// prefers anonymous memory for performance reasons. 
-    pub fn allocate_guest_memory(&self) -> Result<Vec<GuestRegionMmap>, MemoryError> {
-        let vhost_user_device_used = self
-            .block
+    /// Returns true if any vhost-user devices are configured in this [`VmResources`] object
+    pub fn vhost_user_devices_used(&self) -> bool {
+        self.block
             .devices
             .iter()
-            .any(|b| b.lock().expect("Poisoned lock").is_vhost_user());
+            .any(|b| b.lock().expect("Poisoned lock").is_vhost_user())
+    }

-        // Page faults are more expensive for shared memory mapping, including memfd.
-        // For this reason, we only back guest memory with a memfd
-        // if a vhost-user-blk device is configured in the VM, otherwise we fall back to
-        // an anonymous private memory.
-        //
-        // The vhost-user-blk branch is not currently covered by integration tests in Rust,
-        // because that would require running a backend process. If in the future we converge to
-        // a single way of backing guest memory for vhost-user and non-vhost-user cases,
-        // that would not be worth the effort.
-        let regions =
-            crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib));
-        if vhost_user_device_used {
+    /// The size of the swiotlb region requested, in MiB
+    #[cfg(target_arch = "aarch64")]
+    pub fn swiotlb_size_mib(&self) -> usize {
+        self.machine_config.mem_config.initial_swiotlb_size
+    }
+
+    /// The size of the swiotlb region requested, in MiB
+    #[cfg(target_arch = "x86_64")]
+    pub fn swiotlb_size_mib(&self) -> usize {
+        0
+    }
+
+    /// Whether the use of swiotlb was requested
+    pub fn swiotlb_used(&self) -> bool {
+        self.swiotlb_size_mib() > 0
+    }
+
+    fn allocate_memory(
+        &self,
+        offset: usize,
+        size: usize,
+        vhost_accessible: bool,
+    ) -> Result<Vec<GuestRegionMmap>, MemoryError> {
+        let regions = crate::arch::arch_memory_regions(offset, size);
+        if vhost_accessible {
             memory::memfd_backed(
                 regions.as_ref(),
                 self.machine_config.track_dirty_pages,
@@ -473,6 +483,47 @@
             )
         }
     }
+
+    /// Allocates guest memory in a configuration most appropriate for these [`VmResources`].
+    ///
+    /// If vhost-user-blk devices are in use, allocates memfd-backed shared memory, otherwise
+    /// prefers anonymous memory for performance reasons.
+    pub fn allocate_guest_memory(&self) -> Result<Vec<GuestRegionMmap>, MemoryError> {
+        // Page faults are more expensive for shared memory mapping, including memfd.
+        // For this reason, we only back guest memory with a memfd
+        // if a vhost-user-blk device is configured in the VM, otherwise we fall back to
+        // an anonymous private memory.
+        //
+        // Note that if a swiotlb region is used, no I/O will go through the "regular"
+        // memory regions, and we can back them with anon memory regardless.
+        //
+        // The vhost-user-blk branch is not currently covered by integration tests in Rust,
+        // because that would require running a backend process. If in the future we converge to
+        // a single way of backing guest memory for vhost-user and non-vhost-user cases,
+        // that would not be worth the effort.
+        self.allocate_memory(
+            0,
+            mib_to_bytes(self.machine_config.mem_size_mib - self.swiotlb_size_mib()),
+            self.vhost_user_devices_used() && !self.swiotlb_used(),
+        )
+    }
+
+    /// Allocates the dedicated I/O region for swiotlb use, if one was requested.
+ pub fn allocate_swiotlb_region(&self) -> Result, MemoryError> { + if !self.swiotlb_used() { + return Ok(None); + } + + let swiotlb_size = mib_to_bytes(self.swiotlb_size_mib()); + let start = mib_to_bytes(self.machine_config.mem_size_mib) - swiotlb_size; + let start = start.max(crate::arch::bytes_before_last_gap()); + + let mut mem = self.allocate_memory(start, swiotlb_size, self.vhost_user_devices_used())?; + + assert_eq!(mem.len(), 1); + + Ok(Some(mem.remove(0))) + } } impl From<&VmResources> for VmmConfig { @@ -1302,6 +1353,7 @@ mod tests { let mut aux_vm_config = MachineConfigUpdate { vcpu_count: Some(32), mem_size_mib: Some(512), + mem_config: Some(Default::default()), smt: Some(false), #[cfg(target_arch = "x86_64")] cpu_template: Some(StaticCpuTemplate::T2), @@ -1323,44 +1375,6 @@ mod tests { aux_vm_config ); - // Invalid vcpu count. - aux_vm_config.vcpu_count = Some(0); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - aux_vm_config.vcpu_count = Some(33); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - - // Check that SMT is not supported on aarch64, and that on x86_64 enabling it requires vcpu - // count to be even. - aux_vm_config.smt = Some(true); - #[cfg(target_arch = "aarch64")] - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::SmtNotSupported) - ); - aux_vm_config.vcpu_count = Some(3); - #[cfg(target_arch = "x86_64")] - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidVcpuCount) - ); - aux_vm_config.vcpu_count = Some(32); - #[cfg(target_arch = "x86_64")] - vm_resources.update_machine_config(&aux_vm_config).unwrap(); - aux_vm_config.smt = Some(false); - - // Invalid mem_size_mib. - aux_vm_config.mem_size_mib = Some(0); - assert_eq!( - vm_resources.update_machine_config(&aux_vm_config), - Err(MachineConfigError::InvalidMemorySize) - ); - // Incompatible mem_size_mib with balloon size. vm_resources.machine_config.mem_size_mib = 128; vm_resources @@ -1379,23 +1393,6 @@ mod tests { // mem_size_mib compatible with balloon size. aux_vm_config.mem_size_mib = Some(256); vm_resources.update_machine_config(&aux_vm_config).unwrap(); - - // mem_size_mib incompatible with huge pages configuration - aux_vm_config.mem_size_mib = Some(129); - aux_vm_config.huge_pages = Some(HugePageConfig::Hugetlbfs2M); - assert_eq!( - vm_resources - .update_machine_config(&aux_vm_config) - .unwrap_err(), - MachineConfigError::InvalidMemorySize - ); - - // mem_size_mib compatible with huge page configuration - aux_vm_config.mem_size_mib = Some(2048); - // Remove the balloon device config that's added by `default_vm_resources` as it would - // trigger the "ballooning incompatible with huge pages" check. 
- vm_resources.balloon = BalloonBuilder::new(); - vm_resources.update_machine_config(&aux_vm_config).unwrap(); } #[test] diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 127b75e594e..81747c9bf1a 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -7,7 +7,7 @@ use std::sync::{Arc, Mutex, MutexGuard}; use serde_json::Value; use utils::time::{ClockType, get_time_us}; -use super::builder::build_and_boot_microvm; +use super::builder::{BuildMicrovmFromSnapshotError, build_and_boot_microvm}; use super::persist::{create_snapshot, restore_from_snapshot}; use super::resources::VmResources; use super::{Vmm, VmmError}; @@ -16,7 +16,7 @@ use crate::builder::StartMicrovmError; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::logger::{LoggerConfig, info, warn, *}; use crate::mmds::data_store::{self, Mmds}; -use crate::persist::{CreateSnapshotError, RestoreFromSnapshotError, VmInfo}; +use crate::persist::{CreateSnapshotError, VmInfo}; use crate::resources::VmmConfig; use crate::seccomp::BpfThreadMap; use crate::vmm_config::balloon::{ @@ -275,7 +275,7 @@ pub enum LoadSnapshotError { /// Loading a microVM snapshot not allowed after configuring boot-specific resources. LoadSnapshotNotAllowed, /// Failed to restore from snapshot: {0} - RestoreFromSnapshot(#[from] RestoreFromSnapshotError), + RestoreFromSnapshot(#[from] BuildMicrovmFromSnapshotError), /// Failed to resume microVM: {0} ResumeMicrovm(#[from] VmmError), } diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 7cb16a2a213..9bec192fc0e 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -12,11 +12,12 @@ use crate::builder::build_microvm_for_boot; use crate::resources::VmResources; use crate::seccomp::get_empty_filters; use crate::test_utils::mock_resources::{MockBootSourceConfig, MockVmConfig, MockVmResources}; +use crate::vm_memory_vendored::GuestRegionCollection; use crate::vmm_config::boot_source::BootSourceConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::HugePageConfig; use crate::vstate::memory; -use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap}; +use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap, KvmRegion}; use crate::{EventManager, Vmm}; pub mod mock_resources; @@ -43,9 +44,12 @@ pub fn single_region_mem_at_raw(at: u64, size: usize) -> Vec { /// Creates a [`GuestMemoryMmap`] with multiple regions and without dirty page tracking. pub fn multi_region_mem(regions: &[(GuestAddress, usize)]) -> GuestMemoryMmap { - GuestMemoryMmap::from_regions( + GuestRegionCollection::from_regions( memory::anonymous(regions.iter().copied(), false, HugePageConfig::None) - .expect("Cannot initialize memory"), + .expect("Cannot initialize memory") + .into_iter() + .map(|region| KvmRegion::from_mmap_region(region, 0)) + .collect(), ) .unwrap() } diff --git a/src/vmm/src/vm_memory_vendored.rs b/src/vmm/src/vm_memory_vendored.rs new file mode 100644 index 00000000000..c260c8addad --- /dev/null +++ b/src/vmm/src/vm_memory_vendored.rs @@ -0,0 +1,292 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Module temporarily containing vendored in-review vm-memory features +//! +//! 
TODO: To be removed once https://github.com/rust-vmm/vm-memory/pull/312 is merged + +#![allow(clippy::cast_possible_truncation)] // vm-memory has different clippy configuration + +use std::io::{Read, Write}; +use std::sync::Arc; +use std::sync::atomic::Ordering; + +use vm_memory::guest_memory::Result; +use vm_memory::{ + Address, AtomicAccess, Bytes, Error, GuestAddress, GuestMemory, GuestMemoryError, + GuestMemoryRegion, GuestUsize, MemoryRegionAddress, +}; + +use crate::vstate::memory::KvmRegion; + +/// [`GuestMemory`](trait.GuestMemory.html) implementation based on a homogeneous collection +/// of [`GuestMemoryRegion`] implementations. +/// +/// Represents a sorted set of non-overlapping physical guest memory regions. +#[derive(Debug)] +pub struct GuestRegionCollection { + regions: Vec>, +} + +impl Default for GuestRegionCollection { + fn default() -> Self { + Self { + regions: Vec::new(), + } + } +} + +impl Clone for GuestRegionCollection { + fn clone(&self) -> Self { + GuestRegionCollection { + regions: self.regions.iter().map(Arc::clone).collect(), + } + } +} + +impl GuestRegionCollection { + /// Creates an empty `GuestMemoryMmap` instance. + pub fn new() -> Self { + Self::default() + } + + /// Creates a new [`GuestRegionCollection`] from a vector of regions. + /// + /// # Arguments + /// + /// * `regions` - The vector of regions. The regions shouldn't overlap, and they should be + /// sorted by the starting address. + pub fn from_regions(mut regions: Vec) -> std::result::Result { + Self::from_arc_regions(regions.drain(..).map(Arc::new).collect()) + } + + /// Creates a new [`GuestRegionCollection`] from a vector of Arc regions. + /// + /// Similar to the constructor `from_regions()` as it returns a + /// [`GuestRegionCollection`]. The need for this constructor is to provide a way for + /// consumer of this API to create a new [`GuestRegionCollection`] based on existing + /// regions coming from an existing [`GuestRegionCollection`] instance. + /// + /// # Arguments + /// + /// * `regions` - The vector of `Arc` regions. The regions shouldn't overlap and they should be + /// sorted by the starting address. + pub fn from_arc_regions(regions: Vec>) -> std::result::Result { + if regions.is_empty() { + return Err(Error::NoMemoryRegion); + } + + for window in regions.windows(2) { + let prev = &window[0]; + let next = &window[1]; + + if prev.start_addr() > next.start_addr() { + return Err(Error::UnsortedMemoryRegions); + } + + if prev.last_addr() >= next.start_addr() { + return Err(Error::MemoryRegionOverlap); + } + } + + Ok(Self { regions }) + } + + /// Insert a region into the `GuestMemoryMmap` object and return a new `GuestMemoryMmap`. + /// + /// # Arguments + /// * `region`: the memory region to insert into the guest memory object. + pub fn insert_region( + &self, + region: Arc, + ) -> std::result::Result, Error> { + let mut regions = self.regions.clone(); + regions.push(region); + regions.sort_by_key(|x| x.start_addr()); + + Self::from_arc_regions(regions) + } + + /// Remove a region from the [`GuestRegionCollection`] object and return a new + /// `GuestRegionCollection` on success, together with the removed region. 
+ /// + /// # Arguments + /// * `base`: base address of the region to be removed + /// * `size`: size of the region to be removed + pub fn remove_region( + &self, + base: GuestAddress, + size: GuestUsize, + ) -> std::result::Result<(GuestRegionCollection, Arc), Error> { + if let Ok(region_index) = self.regions.binary_search_by_key(&base, |x| x.start_addr()) { + if self.regions.get(region_index).unwrap().len() == size { + let mut regions = self.regions.clone(); + let region = regions.remove(region_index); + return Ok((Self { regions }, region)); + } + } + + Err(Error::NoMemoryRegion) + } +} + +impl GuestMemory for GuestRegionCollection { + type R = R; + + fn num_regions(&self) -> usize { + self.regions.len() + } + + fn find_region(&self, addr: GuestAddress) -> Option<&R> { + let index = match self.regions.binary_search_by_key(&addr, |x| x.start_addr()) { + Ok(x) => Some(x), + // Within the closest region with starting address < addr + Err(x) if (x > 0 && addr <= self.regions[x - 1].last_addr()) => Some(x - 1), + _ => None, + }; + index.map(|x| self.regions[x].as_ref()) + } + + fn iter(&self) -> impl Iterator { + self.regions.iter().map(AsRef::as_ref) + } +} + +// This impl will be subsumed by the default impl in vm-memory#312 +impl Bytes for KvmRegion { + type E = GuestMemoryError; + + /// # Examples + /// * Write a slice at guest address 0x1200. + /// + /// ``` + /// # #[cfg(feature = "backend-mmap")] + /// # use vm_memory::{Bytes, GuestAddress, GuestMemoryMmap}; + /// # + /// # #[cfg(feature = "backend-mmap")] + /// # { + /// # let start_addr = GuestAddress(0x1000); + /// # let mut gm = GuestMemoryMmap::<()>::from_ranges(&vec![(start_addr, 0x400)]) + /// # .expect("Could not create guest memory"); + /// # + /// let res = gm + /// .write(&[1, 2, 3, 4, 5], GuestAddress(0x1200)) + /// .expect("Could not write to guest memory"); + /// assert_eq!(5, res); + /// # } + /// ``` + fn write(&self, buf: &[u8], addr: MemoryRegionAddress) -> Result { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? + .write(buf, maddr) + .map_err(Into::into) + } + + /// # Examples + /// * Read a slice of length 16 at guestaddress 0x1200. + /// + /// ``` + /// # #[cfg(feature = "backend-mmap")] + /// # use vm_memory::{Bytes, GuestAddress, GuestMemoryMmap}; + /// # + /// # #[cfg(feature = "backend-mmap")] + /// # { + /// # let start_addr = GuestAddress(0x1000); + /// # let mut gm = GuestMemoryMmap::<()>::from_ranges(&vec![(start_addr, 0x400)]) + /// # .expect("Could not create guest memory"); + /// # + /// let buf = &mut [0u8; 16]; + /// let res = gm + /// .read(buf, GuestAddress(0x1200)) + /// .expect("Could not read from guest memory"); + /// assert_eq!(16, res); + /// # } + /// ``` + fn read(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> Result { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? + .read(buf, maddr) + .map_err(Into::into) + } + + fn write_slice(&self, buf: &[u8], addr: MemoryRegionAddress) -> Result<()> { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? + .write_slice(buf, maddr) + .map_err(Into::into) + } + + fn read_slice(&self, buf: &mut [u8], addr: MemoryRegionAddress) -> Result<()> { + let maddr = addr.raw_value() as usize; + self.as_volatile_slice()? 
+ .read_slice(buf, maddr) + .map_err(Into::into) + } + + fn store( + &self, + val: T, + addr: MemoryRegionAddress, + order: Ordering, + ) -> Result<()> { + self.as_volatile_slice().and_then(|s| { + s.store(val, addr.raw_value() as usize, order) + .map_err(Into::into) + }) + } + + fn load(&self, addr: MemoryRegionAddress, order: Ordering) -> Result { + self.as_volatile_slice() + .and_then(|s| s.load(addr.raw_value() as usize, order).map_err(Into::into)) + } + + // All remaining functions are deprecated and have been removed in vm-memory/main. + // Firecracker does not use them, so no point in writing out implementations here. + fn read_from( + &self, + _addr: MemoryRegionAddress, + _src: &mut F, + _count: usize, + ) -> std::result::Result + where + F: Read, + { + unimplemented!() + } + + fn read_exact_from( + &self, + _addr: MemoryRegionAddress, + _src: &mut F, + _count: usize, + ) -> std::result::Result<(), Self::E> + where + F: Read, + { + unimplemented!() + } + + fn write_to( + &self, + _addr: MemoryRegionAddress, + _dst: &mut F, + _count: usize, + ) -> std::result::Result + where + F: Write, + { + unimplemented!() + } + + fn write_all_to( + &self, + _addr: MemoryRegionAddress, + _dst: &mut F, + _count: usize, + ) -> std::result::Result<(), Self::E> + where + F: Write, + { + unimplemented!() + } +} diff --git a/src/vmm/src/vmm_config/machine_config.rs b/src/vmm/src/vmm_config/machine_config.rs index cfe7105fdf8..77cec0e3b92 100644 --- a/src/vmm/src/vmm_config/machine_config.rs +++ b/src/vmm/src/vmm_config/machine_config.rs @@ -20,6 +20,9 @@ pub enum MachineConfigError { IncompatibleBalloonSize, /// The memory size (MiB) is either 0, or not a multiple of the configured page size. InvalidMemorySize, + /// The specified swiotlb region matches or exceeds the total VM memory, or not a multiple of the configured page size. + #[cfg(target_arch = "aarch64")] + InvalidSwiotlbRegionSize, /// The number of vCPUs must be greater than 0, less than {MAX_SUPPORTED_VCPUS:} and must be 1 or an even number if SMT is enabled. InvalidVcpuCount, /// Could not get the configuration of the previously installed balloon device to validate the memory size. @@ -89,6 +92,17 @@ impl From for Option { } } +/// Structure containing options for tweaking guest memory configuration. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] +pub struct MemoryConfig { + /// The initial size of the swiotlb region. If 0, no swiotlb region will be created. + /// If non-zero, all device will be forced to bounce buffers through a swiotlb region + /// of the specified size that will have been placed into a dedicated kvm memslot. + #[cfg(target_arch = "aarch64")] + #[serde(default)] + pub initial_swiotlb_size: usize, +} + /// Struct used in PUT `/machine-config` API call. #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] #[serde(deny_unknown_fields)] @@ -97,6 +111,10 @@ pub struct MachineConfig { pub vcpu_count: u8, /// The memory size in MiB. pub mem_size_mib: usize, + /// Additional configuration options for guest memory + #[serde(default)] + #[serde(skip_serializing_if = "is_default")] + pub mem_config: MemoryConfig, /// Enables or disabled SMT. 
#[serde(default)] pub smt: bool, @@ -121,6 +139,10 @@ pub struct MachineConfig { pub gdb_socket_path: Option, } +fn is_default(t: &T) -> bool { + t == &T::default() +} + fn is_none_or_custom_template(template: &Option) -> bool { matches!(template, None | Some(CpuTemplateType::Custom(_))) } @@ -153,6 +175,7 @@ impl Default for MachineConfig { Self { vcpu_count: 1, mem_size_mib: DEFAULT_MEM_SIZE_MIB, + mem_config: Default::default(), smt: false, cpu_template: None, track_dirty_pages: false, @@ -178,6 +201,9 @@ pub struct MachineConfigUpdate { /// The memory size in MiB. #[serde(default)] pub mem_size_mib: Option, + /// The memory configuration + #[serde(default)] + pub mem_config: Option, /// Enables or disabled SMT. #[serde(default)] pub smt: Option, @@ -210,6 +236,7 @@ impl From for MachineConfigUpdate { MachineConfigUpdate { vcpu_count: Some(cfg.vcpu_count), mem_size_mib: Some(cfg.mem_size_mib), + mem_config: Some(cfg.mem_config), smt: Some(cfg.smt), cpu_template: cfg.static_template(), track_dirty_pages: Some(cfg.track_dirty_pages), @@ -263,11 +290,19 @@ impl MachineConfig { let mem_size_mib = update.mem_size_mib.unwrap_or(self.mem_size_mib); let page_config = update.huge_pages.unwrap_or(self.huge_pages); + let mem_config = update.mem_config.unwrap_or(self.mem_config); if mem_size_mib == 0 || !page_config.is_valid_mem_size(mem_size_mib) { return Err(MachineConfigError::InvalidMemorySize); } + #[cfg(target_arch = "aarch64")] + if mem_config.initial_swiotlb_size >= mem_size_mib + || !page_config.is_valid_mem_size(mem_config.initial_swiotlb_size) + { + return Err(MachineConfigError::InvalidSwiotlbRegionSize); + } + let cpu_template = match update.cpu_template { None => self.cpu_template.clone(), Some(StaticCpuTemplate::None) => None, @@ -277,6 +312,7 @@ impl MachineConfig { Ok(MachineConfig { vcpu_count, mem_size_mib, + mem_config, smt, cpu_template, track_dirty_pages: update.track_dirty_pages.unwrap_or(self.track_dirty_pages), @@ -290,7 +326,107 @@ impl MachineConfig { #[cfg(test)] mod tests { use crate::cpu_config::templates::{CpuTemplateType, CustomCpuTemplate, StaticCpuTemplate}; - use crate::vmm_config::machine_config::MachineConfig; + use crate::vmm_config::machine_config::{ + HugePageConfig, MachineConfig, MachineConfigError, MachineConfigUpdate, MemoryConfig, + }; + + #[test] + #[allow(unused)] // some assertions exist only on specific architectures. + fn test_machine_config_update() { + let mconf = MachineConfig::default(); + + // Invalid vCPU counts + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(0), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(33), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + // Check that SMT is not supported on aarch64, and that on x86_64 enabling it requires vcpu + // count to be even. 
+ let res = mconf.update(&MachineConfigUpdate { + smt: Some(true), + ..Default::default() + }); + #[cfg(target_arch = "aarch64")] + assert_eq!(res, Err(MachineConfigError::SmtNotSupported)); + + let res = mconf.update(&MachineConfigUpdate { + vcpu_count: Some(3), + smt: Some(true), + ..Default::default() + }); + #[cfg(target_arch = "x86_64")] + assert_eq!(res, Err(MachineConfigError::InvalidVcpuCount)); + + // Works if the vcpu count is even indeed + let updated = mconf + .update(&MachineConfigUpdate { + vcpu_count: Some(32), + smt: Some(true), + ..Default::default() + }) + .unwrap(); + assert_eq!(updated.vcpu_count, 32); + assert!(updated.smt); + + // Invalid memory size + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(0), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidMemorySize)); + + // Memory Size incompatible with huge page configuration + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(31), + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }); + assert_eq!(res, Err(MachineConfigError::InvalidMemorySize)); + + // works if the memory size is a multiple of huge page size indeed + let updated = mconf + .update(&MachineConfigUpdate { + mem_size_mib: Some(32), + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }) + .unwrap(); + assert_eq!(updated.huge_pages, HugePageConfig::Hugetlbfs2M); + assert_eq!(updated.mem_size_mib, 32); + + // Test swiotlb carve out is larger than total guest memory, and is compatible with huge + // page config + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(32), + mem_config: Some(MemoryConfig { + #[cfg(target_arch = "aarch64")] + initial_swiotlb_size: 64, + }), + ..Default::default() + }); + #[cfg(target_arch = "aarch64")] + assert_eq!(res, Err(MachineConfigError::InvalidSwiotlbRegionSize)); + + let res = mconf.update(&MachineConfigUpdate { + mem_size_mib: Some(32), + mem_config: Some(MemoryConfig { + #[cfg(target_arch = "aarch64")] + initial_swiotlb_size: 15, + }), + huge_pages: Some(HugePageConfig::Hugetlbfs2M), + ..Default::default() + }); + #[cfg(target_arch = "aarch64")] + assert_eq!(res, Err(MachineConfigError::InvalidSwiotlbRegionSize)); + } // Ensure the special (de)serialization logic for the cpu_template field works: // only static cpu templates can be specified via the machine-config endpoint, but diff --git a/src/vmm/src/vstate/memory.rs b/src/vmm/src/vstate/memory.rs index 19367f7f997..7c0c31ac344 100644 --- a/src/vmm/src/vstate/memory.rs +++ b/src/vmm/src/vstate/memory.rs @@ -7,17 +7,20 @@ use std::fs::File; use std::io::SeekFrom; +use std::mem::ManuallyDrop; use std::sync::Arc; +use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; use serde::{Deserialize, Serialize}; pub use vm_memory::bitmap::{AtomicBitmap, BS, Bitmap, BitmapSlice}; pub use vm_memory::mmap::MmapRegionBuilder; use vm_memory::mmap::{MmapRegionError, NewBitmap}; +use vm_memory::volatile_memory::compute_offset; pub use vm_memory::{ Address, ByteValued, Bytes, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, MemoryRegionAddress, MmapRegion, address, }; -use vm_memory::{Error as VmMemoryError, GuestMemoryError, WriteVolatile}; +use vm_memory::{Error as VmMemoryError, GuestMemoryError, VolatileSlice, WriteVolatile}; use vmm_sys_util::errno; use crate::DirtyBitmap; @@ -25,7 +28,7 @@ use crate::utils::{get_page_size, u64_to_usize}; use crate::vmm_config::machine_config::HugePageConfig; /// Type of 
GuestMemoryMmap. -pub type GuestMemoryMmap = vm_memory::GuestMemoryMmap>; +pub type GuestMemoryMmap = crate::vm_memory_vendored::GuestRegionCollection; /// Type of GuestRegionMmap. pub type GuestRegionMmap = vm_memory::GuestRegionMmap>; /// Type of GuestMmapRegion. @@ -50,14 +53,130 @@ pub enum MemoryError { OffsetTooLarge, } +/// A memory region, described in terms of `kvm_userspace_memory_region` +#[derive(Debug)] +pub struct KvmRegion { + region: kvm_userspace_memory_region, + bitmap: Option, + file_offset: Option, +} + +impl KvmRegion { + /// Constructs a new [`KvmRegion`] from the given [`kvm_userspace_memory_region`] and bitmap. + /// + /// # Safety + /// + /// The caller must guarantee that as long as this `KvmRegion` object is alive, + /// `kvm_region.userspace_addr as *mut u8` is valid for reads and writes of length + /// `kvm_region.memory_size`. + pub unsafe fn new( + region: kvm_userspace_memory_region, + bitmap: Option, + file_offset: Option, + ) -> Self { + KvmRegion { + region, + bitmap, + file_offset, + } + } + + pub(crate) fn from_mmap_region(region: GuestRegionMmap, slot: u32) -> Self { + let region = ManuallyDrop::new(region); + let flags = if region.bitmap().is_some() { + KVM_MEM_LOG_DIRTY_PAGES + } else { + 0 + }; + + // SAFETY: `GuestRegionMmap` is essentially a fat pointer, and ensures that + // region.as_ptr() is valid for reads and writes of length region.len(), + // and by placing our region into a `ManuallyDrop` we ensure that its `Drop` + // impl won't run and free the memory away from underneath us. + unsafe { + Self::new( + kvm_userspace_memory_region { + slot, + flags, + guest_phys_addr: region.start_addr().0, + memory_size: region.len(), + userspace_addr: region.as_ptr() as u64, + }, + region.bitmap().clone(), + region.file_offset().cloned(), + ) + } + } + + pub(crate) fn inner(&self) -> &kvm_userspace_memory_region { + &self.region + } +} + +#[allow(clippy::cast_possible_wrap)] +#[allow(clippy::cast_possible_truncation)] +impl GuestMemoryRegion for KvmRegion { + type B = Option; + + fn len(&self) -> GuestUsize { + self.region.memory_size + } + + fn start_addr(&self) -> GuestAddress { + GuestAddress(self.region.guest_phys_addr) + } + + fn bitmap(&self) -> &Self::B { + &self.bitmap + } + + fn get_host_address( + &self, + addr: MemoryRegionAddress, + ) -> vm_memory::guest_memory::Result<*mut u8> { + self.check_address(addr) + .ok_or(vm_memory::guest_memory::Error::InvalidBackendAddress) + .map(|addr| { + (self.region.userspace_addr as *mut u8).wrapping_offset(addr.raw_value() as isize) + }) + } + + fn file_offset(&self) -> Option<&FileOffset> { + self.file_offset.as_ref() + } + + fn get_slice( + &self, + offset: MemoryRegionAddress, + count: usize, + ) -> vm_memory::guest_memory::Result>> { + let offset = u64_to_usize(offset.0); + let end_addr = compute_offset(offset, count)? as u64; + if end_addr > self.len() { + return Err(vm_memory::guest_memory::Error::InvalidBackendAddress); + } + + // SAFETY: Safe because we checked that offset + count was within our range and we only + // ever hand out volatile accessors. 
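+ // The pointer arithmetic below additionally relies on the invariant documented on
+ // [`KvmRegion::new`]: `userspace_addr` must stay valid for reads and writes of
+ // `memory_size` bytes for as long as this region is alive.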
+ unsafe { + Ok(VolatileSlice::with_bitmap( + (self.region.userspace_addr as *mut u8).add(offset), + count, + self.bitmap.slice_at(offset), + None, + )) + } + } +} + /// Creates a `Vec` of `GuestRegionMmap` with the given configuration pub fn create( regions: impl Iterator, mmap_flags: libc::c_int, file: Option, track_dirty_pages: bool, + mut offset: u64, ) -> Result, MemoryError> { - let mut offset = 0; let file = file.map(Arc::new); regions .map(|(start, size)| { @@ -105,6 +224,7 @@ pub fn memfd_backed( libc::MAP_SHARED | huge_pages.mmap_flags(), Some(memfd_file), track_dirty_pages, + 0, ) } @@ -119,6 +239,7 @@ pub fn anonymous( libc::MAP_PRIVATE | libc::MAP_ANONYMOUS | huge_pages.mmap_flags(), None, track_dirty_pages, + 0, ) } @@ -128,8 +249,15 @@ pub fn snapshot_file( file: File, regions: impl Iterator, track_dirty_pages: bool, + offset: u64, ) -> Result, MemoryError> { - create(regions, libc::MAP_PRIVATE, Some(file), track_dirty_pages) + create( + regions, + libc::MAP_PRIVATE, + Some(file), + track_dirty_pages, + offset, + ) } /// Defines the interface for snapshotting memory. @@ -227,8 +355,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { let mut writer_offset = 0; let page_size = get_page_size().map_err(MemoryError::PageSize)?; - let write_result = self.iter().zip(0..).try_for_each(|(region, slot)| { - let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + let write_result = self.iter().try_for_each(|region| { + let kvm_bitmap = dirty_bitmap.get(®ion.inner().slot).unwrap(); let firecracker_bitmap = region.bitmap(); let mut write_size = 0; let mut dirty_batch_start: u64 = 0; @@ -291,8 +419,8 @@ impl GuestMemoryExtension for GuestMemoryMmap { /// Stores the dirty bitmap inside into the internal bitmap fn store_dirty_bitmap(&self, dirty_bitmap: &DirtyBitmap, page_size: usize) { - self.iter().zip(0..).for_each(|(region, slot)| { - let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + self.iter().for_each(|region| { + let kvm_bitmap = dirty_bitmap.get(®ion.inner().slot).unwrap(); let firecracker_bitmap = region.bitmap(); for (i, v) in kvm_bitmap.iter().enumerate() { @@ -353,6 +481,17 @@ mod tests { use crate::snapshot::Snapshot; use crate::utils::{get_page_size, mib_to_bytes}; + fn kvmify(regions: Vec) -> GuestMemoryMmap { + GuestMemoryMmap::from_regions( + regions + .into_iter() + .zip(0u32..) 
// assign dummy slots + .map(|(region, slot)| KvmRegion::from_mmap_region(region, slot)) + .collect(), + ) + .unwrap() + } + #[test] fn test_anonymous() { for dirty_page_tracking in [true, false] { @@ -386,10 +525,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + kvmify(anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap()); let dirty_map = [ // page 0: not dirty @@ -430,7 +567,7 @@ mod tests { } } - fn check_serde(guest_memory: &GuestMemoryMmap) { + fn check_serde(guest_memory: &M) { let mut snapshot_data = vec![0u8; 10000]; let original_state = guest_memory.describe(); Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &original_state).unwrap(); @@ -444,15 +581,14 @@ mod tests { let region_size = page_size * 3; // Test with a single region - let guest_memory = GuestMemoryMmap::from_regions( + let guest_memory = kvmify( anonymous( [(GuestAddress(0), region_size)].into_iter(), false, HugePageConfig::None, ) .unwrap(), - ) - .unwrap(); + ); check_serde(&guest_memory); // Test with some regions @@ -461,10 +597,8 @@ mod tests { (GuestAddress(region_size as u64), region_size), // pages 3-5 (GuestAddress(region_size as u64 * 2), region_size), // pages 6-8 ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + kvmify(anonymous(regions.into_iter(), true, HugePageConfig::None).unwrap()); check_serde(&guest_memory); } @@ -477,10 +611,8 @@ mod tests { (GuestAddress(0), page_size), (GuestAddress(page_size as u64 * 2), page_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + kvmify(anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap()); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -503,10 +635,8 @@ mod tests { (GuestAddress(0), page_size * 3), (GuestAddress(page_size as u64 * 4), page_size * 3), ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + kvmify(anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap()); let expected_memory_state = GuestMemoryState { regions: vec![ @@ -537,10 +667,8 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + kvmify(anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap()); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -562,10 +690,8 @@ mod tests { let mut memory_file = TempFile::new().unwrap().into_file(); guest_memory.dump(&mut memory_file).unwrap(); - let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(memory_file, memory_state.regions(), false).unwrap(), - ) - .unwrap(); + let restored_guest_memory = + kvmify(snapshot_file(memory_file, memory_state.regions(), false, 0).unwrap()); // Check that the region contents are the same. 
let mut restored_region = vec![0u8; page_size * 2]; @@ -592,10 +718,8 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + kvmify(anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap()); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { assert!(!r.bitmap().dirty_at(0)); @@ -624,10 +748,8 @@ mod tests { guest_memory.dump_dirty(&mut file, &dirty_bitmap).unwrap(); // We can restore from this because this is the first dirty dump. - let restored_guest_memory = GuestMemoryMmap::from_regions( - snapshot_file(file, memory_state.regions(), false).unwrap(), - ) - .unwrap(); + let restored_guest_memory = + kvmify(snapshot_file(file, memory_state.regions(), false, 0).unwrap()); // Check that the region contents are the same. let mut restored_region = vec![0u8; region_size]; @@ -683,10 +805,8 @@ mod tests { (region_1_address, region_size), (region_2_address, region_size), ]; - let guest_memory = GuestMemoryMmap::from_regions( - anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap(), - ) - .unwrap(); + let guest_memory = + kvmify(anonymous(mem_regions.into_iter(), true, HugePageConfig::None).unwrap()); // Check that Firecracker bitmap is clean. guest_memory.iter().for_each(|r| { diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 7a8965a4b9a..ff6002e00c7 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -11,17 +11,18 @@ use std::io::Write; use std::path::Path; use std::sync::Arc; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; use kvm_ioctls::VmFd; +use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; use crate::logger::info; -use crate::persist::CreateSnapshotError; +use crate::persist::{CreateSnapshotError, GuestRegionUffdMapping}; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ - Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, + KvmRegion, }; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; @@ -34,6 +35,8 @@ pub struct VmCommon { max_memslots: usize, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + /// The swiotlb regions of this Vm. + pub swiotlb_regions: GuestMemoryMmap, } /// Errors associated with the wrappers over KVM ioctls. @@ -101,6 +104,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + swiotlb_regions: GuestMemoryMmap::default(), }) } @@ -136,42 +140,58 @@ impl Vm { Ok(()) } - /// Register a new memory region to this [`Vm`]. - pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { + fn kvmify_region(&self, region: GuestRegionMmap) -> Result { let next_slot = self .guest_memory() .num_regions() + .checked_add(self.swiotlb_regions().num_regions()) + .ok_or(VmError::NotEnoughMemorySlots)? 
.try_into() .map_err(|_| VmError::NotEnoughMemorySlots)?; + if next_slot as usize >= self.common.max_memslots { return Err(VmError::NotEnoughMemorySlots); } - let flags = if region.bitmap().is_some() { - KVM_MEM_LOG_DIRTY_PAGES - } else { - 0 - }; - - let memory_region = kvm_userspace_memory_region { - slot: next_slot, - guest_phys_addr: region.start_addr().raw_value(), - memory_size: region.len(), - userspace_addr: region.as_ptr() as u64, - flags, - }; - - let new_guest_memory = self.common.guest_memory.insert_region(Arc::new(region))?; + Ok(KvmRegion::from_mmap_region(region, next_slot)) + } + fn register_kvm_region(&mut self, region: &KvmRegion) -> Result<(), VmError> { // SAFETY: Safe because the fd is a valid KVM file descriptor. unsafe { self.fd() - .set_user_memory_region(memory_region) + .set_user_memory_region(*region.inner()) .map_err(VmError::SetUserMemoryRegion)?; } + Ok(()) + } + + /// Register a new memory region to this [`Vm`]. + pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { + let arcd_region = Arc::new(self.kvmify_region(region)?); + let new_guest_memory = self + .common + .guest_memory + .insert_region(Arc::clone(&arcd_region))?; + + self.register_kvm_region(arcd_region.as_ref())?; + self.common.guest_memory = new_guest_memory; + Ok(()) + } + + /// Registers a new io memory region to this [`Vm`]. + pub fn register_swiotlb_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { + let arcd_region = Arc::new(self.kvmify_region(region)?); + let new_collection = self + .common + .swiotlb_regions + .insert_region(Arc::clone(&arcd_region))?; + + self.register_kvm_region(arcd_region.as_ref())?; + self.common.swiotlb_regions = new_collection; Ok(()) } @@ -180,32 +200,97 @@ impl Vm { &self.common.fd } - /// Gets a reference to this [`Vm`]'s [`GuestMemoryMmap`] object + /// Gets a reference to this [`Vm`]'s [`GuestMemoryMmap`] object, which + /// contains all non-swiotlb guest memory regions. pub fn guest_memory(&self) -> &GuestMemoryMmap { &self.common.guest_memory } - /// Resets the KVM dirty bitmap for each of the guest's memory regions. - pub fn reset_dirty_bitmap(&self) { + /// Returns a reference to the [`GuestMemoryMmap`] that I/O devices attached to this [`Vm`] + /// have access to. If no I/O regions were registered, return the same as [`Vm::guest_memory`], + /// otherwise returns the [`GuestMemoryMmap`] describing a specific swiotlb region. + pub fn io_memory(&self) -> &GuestMemoryMmap { + if self.has_swiotlb() { + &self.common.swiotlb_regions + } else { + &self.common.guest_memory + } + } + + /// Gets a reference to the [`GuestMemoryMmap`] holding the swiotlb regions registered to + /// this [`Vm`]. Unlike [`Vm::io_memory`], does not fall back to returning the + /// [`GuestMemoryMmap`] of normal memory when no swiotlb regions were registered. + pub fn swiotlb_regions(&self) -> &GuestMemoryMmap { + &self.common.swiotlb_regions + } + + /// Returns `true` iff any io memory regions where registered via [`Vm::register_io_region`]. + pub fn has_swiotlb(&self) -> bool { + self.common.swiotlb_regions.num_regions() > 0 + } + + /// Returns an iterator over all regions, normal and swiotlb. + fn all_regions(&self) -> impl Iterator { self.guest_memory() .iter() - .zip(0u32..) 
- .for_each(|(region, slot)| { - let _ = self.fd().get_dirty_log(slot, u64_to_usize(region.len())); + .chain(self.common.swiotlb_regions.iter()) + } + + pub(crate) fn create_uffd( + &self, + ) -> Result<(Uffd, Vec), userfaultfd::Error> { + let mut uffd_builder = UffdBuilder::new(); + let mut mappings = Vec::new(); + + // We only make use of this if balloon devices are present, but we can enable it unconditionally + // because the only place the kernel checks this is in a hook from madvise, e.g. it doesn't + // actively change the behavior of UFFD, only passively. Without balloon devices + // we never call madvise anyway, so no need to put this into a conditional. + uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + + let uffd = uffd_builder + .close_on_exec(true) + .non_blocking(true) + .user_mode_only(false) + .create()?; + + let mut offset = 0; + + for mem_region in self.common.guest_memory.iter() { + uffd.register( + mem_region.inner().userspace_addr as *mut libc::c_void, + u64_to_usize(mem_region.len()), + )?; + mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: mem_region.inner().userspace_addr, + size: u64_to_usize(mem_region.len()), + offset, + ..Default::default() }); + + offset += mem_region.len(); + } + + Ok((uffd, mappings)) + } + + /// Resets the KVM dirty bitmap for each of the guest's memory regions. + pub fn reset_dirty_bitmap(&self) { + self.all_regions().for_each(|region| { + let _ = self + .fd() + .get_dirty_log(region.inner().slot, u64_to_usize(region.len())); + }); } /// Retrieves the KVM dirty bitmap for each of the guest's memory regions. pub fn get_dirty_bitmap(&self) -> Result { let mut bitmap: DirtyBitmap = HashMap::new(); - self.guest_memory() - .iter() - .zip(0u32..) - .try_for_each(|(region, slot)| { - self.fd() - .get_dirty_log(slot, u64_to_usize(region.len())) - .map(|bitmap_region| _ = bitmap.insert(slot, bitmap_region)) - })?; + self.all_regions().try_for_each(|region| { + self.fd() + .get_dirty_log(region.inner().slot, u64_to_usize(region.len())) + .map(|bitmap_region| _ = bitmap.insert(region.inner().slot, bitmap_region)) + })?; Ok(bitmap) } @@ -262,10 +347,14 @@ impl Vm { match snapshot_type { SnapshotType::Diff => { let dirty_bitmap = self.get_dirty_bitmap()?; - self.guest_memory().dump_dirty(&mut file, &dirty_bitmap)?; + self.guest_memory() + .dump_dirty(&mut file, &dirty_bitmap) + .and_then(|_| self.swiotlb_regions().dump_dirty(&mut file, &dirty_bitmap))?; } SnapshotType::Full => { - self.guest_memory().dump(&mut file)?; + self.guest_memory() + .dump(&mut file) + .and_then(|_| self.swiotlb_regions().dump(&mut file))?; self.reset_dirty_bitmap(); self.guest_memory().reset_dirty(); } diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 55fb07c1aae..01ba8d705be 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -7,7 +7,9 @@ use std::time::Duration; use vmm::builder::build_and_boot_microvm; use vmm::devices::virtio::block::CacheType; -use vmm::persist::{MicrovmState, MicrovmStateError, VmInfo, snapshot_state_sanity_check}; +use vmm::persist::{ + MicrovmState, MicrovmStateError, RestoreMemoryError, VmInfo, snapshot_state_sanity_check, +}; use vmm::resources::VmResources; use vmm::rpc_interface::{ LoadSnapshotError, PrebootApiController, RuntimeApiController, VmmAction, VmmActionError, @@ -295,8 +297,6 @@ fn test_create_and_load_snapshot() { #[test] fn test_snapshot_load_sanity_checks() { - use vmm::persist::SnapShotStateSanityCheckError; - let mut 
microvm_state = get_microvm_state_from_snapshot(); snapshot_state_sanity_check(µvm_state).unwrap(); @@ -305,10 +305,10 @@ fn test_snapshot_load_sanity_checks() { microvm_state.vm_state.memory.regions.clear(); // Validate sanity checks fail because there is no mem region in state. - assert_eq!( + assert!(matches!( snapshot_state_sanity_check(µvm_state), - Err(SnapShotStateSanityCheckError::NoMemory) - ); + Err(RestoreMemoryError::NoMemory) + )); } fn get_microvm_state_from_snapshot() -> MicrovmState { diff --git a/tests/conftest.py b/tests/conftest.py index 2af0d04ec6a..67f71baaf66 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -376,6 +376,16 @@ def io_engine(request): return request.param +@pytest.fixture( + params=[None, 16, 32, 64] if platform.machine() == "aarch64" else [None] +) +def memory_config(request): + """Differently configured swiotlb regions. Only supported on aarch64""" + if request.param is None: + return None + return {"initial_swiotlb_size": request.param} + + @pytest.fixture def results_dir(request): """ diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index f8457e2e068..7cc99294557 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -248,6 +248,7 @@ def __init__( self.disks_vhost_user = {} self.vcpus_count = None self.mem_size_bytes = None + self.memory_config = None self.cpu_template_name = None # The given custom CPU template will be set in basic_config() but could # be overwritten via set_cpu_template(). @@ -462,6 +463,7 @@ def dimensions(self): "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + **(self.memory_config if self.memory_config else {}), } @property @@ -729,6 +731,7 @@ def basic_config( rootfs_io_engine=None, cpu_template: Optional[str] = None, enable_entropy_device=False, + memory_config=None, ): """Shortcut for quickly configuring a microVM. @@ -747,15 +750,23 @@ def basic_config( Reference: file:../../src/vmm/src/vmm_config/boot_source.rs::DEFAULT_KERNEL_CMDLINE """ + # Have to do it this way as otherwise A/B-tests fail if the 'A' revision + # of Firecracker doesn't knwo about the mem_config parameter. 
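+ # (i.e. only forward mem_config to the machine-config API when a swiotlb
+ # configuration was explicitly requested by the test)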
+ kwargs = {} + if memory_config is not None: + kwargs["mem_config"] = memory_config + self.api.machine_config.put( vcpu_count=vcpu_count, smt=smt, mem_size_mib=mem_size_mib, track_dirty_pages=track_dirty_pages, huge_pages=huge_pages, + **kwargs, ) self.vcpus_count = vcpu_count self.mem_size_bytes = mem_size_mib * 2**20 + self.memory_config = memory_config if self.custom_cpu_template is not None: self.set_cpu_template(self.custom_cpu_template) diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index deb9b2dd6c3..e589acb943b 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -338,9 +338,7 @@ def test_negative_snapshot_permissions(uvm_plain_rw, microvm_factory): microvm.spawn() expected_err = re.escape( - "Load snapshot error: Failed to restore from snapshot: Failed to load guest " - "memory: Error creating guest memory from file: Failed to load guest memory: " - "Permission denied (os error 13)" + "Failed to open memory backend file: Permission denied (os error 13)" ) with pytest.raises(RuntimeError, match=expected_err): microvm.restore_from_snapshot(snapshot, resume=True) diff --git a/tests/integration_tests/functional/test_swiotlb.py b/tests/integration_tests/functional/test_swiotlb.py new file mode 100644 index 00000000000..7a58d9045a1 --- /dev/null +++ b/tests/integration_tests/functional/test_swiotlb.py @@ -0,0 +1,65 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Test swiotlb related functionality.""" +import re + +import pytest + +from framework.properties import global_props + +pytestmark = pytest.mark.skipif( + global_props.cpu_architecture != "aarch64", reason="swiotlb only supported on ARM" +) + + +def test_swiotlb_boot(microvm_factory, guest_kernel_linux_6_1, rootfs): + """Tests that a VM can boot if all virtio devices are bound to a swiotlb region, and + that this swiotlb region is actually discovered by the guest.""" + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs) + vm.spawn() + vm.basic_config(memory_config={"initial_swiotlb_size": 64}) + vm.add_net_iface() + vm.start() + + _, dmesg, _ = vm.ssh.check_output("dmesg") + + assert ( + "OF: reserved mem: initialized node bouncy_boi, compatible id restricted-dma-pool" + in dmesg + ) + + match = re.search(r"Placed swiotlb region at \[(\d+), (\d+)\)", vm.log_data) + + assert match is not None, "Firecracker did not print swiotlb region placement" + + swiotlb_start, swiotlb_end = match.group(1, 2) + + found_any = False + + for match in re.finditer( + r"Placed virt queue ([a-zA-Z ]+) at \[(\d+), (\d+)\)", vm.log_data + ): + found_any = True + component, start, end = match.group(1, 2, 3) + + assert ( + start >= swiotlb_start and end <= swiotlb_end + ), f"Guest placed virtio queue component {component} outside of swiotlb region!" 
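+
+ # The loop above is skipped entirely if Firecracker never logged a virt queue placement,
+ # so additionally require that at least one match was found to avoid a vacuous pass.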
+ + assert found_any, "Did not find any virtio devices in Firecracker logs" + + +def test_swiotlb_snapshot(microvm_factory, guest_kernel_linux_6_1, rootfs): + """Tests that a VM with swiotlb regions attached can be snapshotted and restored + again, and that the restored VM can still do I/O.""" + vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs) + vm.spawn() + vm.basic_config(memory_config={"initial_swiotlb_size": 64}) + vm.add_net_iface() + vm.start() + snapshot = vm.snapshot_full() + vm.kill() + + vm = microvm_factory.build_from_snapshot(snapshot) + + vm.ssh.check_output("true") diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index e003807d282..2dfa598ae50 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -44,9 +44,7 @@ def test_bad_socket_path(uvm_plain, snapshot): jailed_vmstate = vm.create_jailed_resource(snapshot.vmstate) expected_msg = re.escape( - "Load snapshot error: Failed to restore from snapshot: Failed to load guest " - "memory: Error creating guest memory from uffd: Failed to connect to UDS Unix stream: No " - "such file or directory (os error 2)" + "Failure to send UFFD handshake to handler process: No such file or directory (os error 2)" ) with pytest.raises(RuntimeError, match=expected_msg): vm.api.snapshot_load.put( @@ -70,10 +68,9 @@ def test_unbinded_socket(uvm_plain, snapshot): jailed_sock_path = vm.create_jailed_resource(socket_path) expected_msg = re.escape( - "Load snapshot error: Failed to restore from snapshot: Failed to load guest " - "memory: Error creating guest memory from uffd: Failed to connect to UDS Unix stream: " - "Connection refused (os error 111)" + "Failure to send UFFD handshake to handler process: Connection refused (os error 111)" ) + with pytest.raises(RuntimeError, match=expected_msg): vm.api.snapshot_load.put( mem_backend={"backend_type": "Uffd", "backend_path": jailed_sock_path}, diff --git a/tests/integration_tests/performance/test_block_ab.py b/tests/integration_tests/performance/test_block_ab.py index b10a41b7c85..2c81d97cdb1 100644 --- a/tests/integration_tests/performance/test_block_ab.py +++ b/tests/integration_tests/performance/test_block_ab.py @@ -148,14 +148,20 @@ def test_block_performance( fio_mode, fio_block_size, io_engine, + memory_config, metrics, ): """ Execute block device emulation benchmarking scenarios. """ + if memory_config is not None and "6.1" not in guest_kernel_acpi: + pytest.skip("swiotlb only supported on aarch64/6.1") + vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) + vm.basic_config( + vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB, memory_config=memory_config + ) vm.add_net_iface() # Add a secondary block device for benchmark tests. 
fs = drive_tools.FilesystemFile( diff --git a/tests/integration_tests/performance/test_network_ab.py b/tests/integration_tests/performance/test_network_ab.py index 3a50d864544..9012164dcb3 100644 --- a/tests/integration_tests/performance/test_network_ab.py +++ b/tests/integration_tests/performance/test_network_ab.py @@ -36,16 +36,21 @@ def consume_ping_output(ping_putput, request_per_round): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): +def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, memory_config): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" + if memory_config is not None and "6.1" not in guest_kernel_acpi: + pytest.skip("swiotlb only supported on aarch64/6.1") + guest_mem_mib = 1024 guest_vcpus = request.param vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) + vm.basic_config( + vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib, memory_config=memory_config + ) vm.add_net_iface() vm.start() vm.pin_threads(0) diff --git a/tests/integration_tests/performance/test_vsock_ab.py b/tests/integration_tests/performance/test_vsock_ab.py index bcee05528af..e3d91900c92 100644 --- a/tests/integration_tests/performance/test_vsock_ab.py +++ b/tests/integration_tests/performance/test_vsock_ab.py @@ -73,7 +73,14 @@ def guest_command(self, port_offset): @pytest.mark.parametrize("payload_length", ["64K", "1024K"], ids=["p64K", "p1024K"]) @pytest.mark.parametrize("mode", ["g2h", "h2g", "bd"]) def test_vsock_throughput( - microvm_factory, guest_kernel_acpi, rootfs, vcpus, payload_length, mode, metrics + microvm_factory, + guest_kernel_acpi, + rootfs, + vcpus, + payload_length, + mode, + metrics, + memory_config, ): """ Test vsock throughput for multiple vm configurations. @@ -84,10 +91,15 @@ def test_vsock_throughput( if mode == "bd" and vcpus < 2: pytest.skip("bidrectional test only done with at least 2 vcpus") + if memory_config is not None and "6.1" not in guest_kernel_acpi: + pytest.skip("swiotlb only supported on aarch64/6.1") + mem_size_mib = 1024 vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) vm.spawn(log_level="Info", emit_metrics=True) - vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) + vm.basic_config( + vcpu_count=vcpus, mem_size_mib=mem_size_mib, memory_config=memory_config + ) vm.add_net_iface() # Create a vsock device vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path="/" + VSOCK_UDS_PATH) diff --git a/tools/devtool b/tools/devtool index 57637a553cc..369e9262268 100755 --- a/tools/devtool +++ b/tools/devtool @@ -570,7 +570,7 @@ ensure_ci_artifacts() { # Fetch all the artifacts so they are local say "Fetching CI artifacts from S3" FC_VERSION=$(cmd_sh "cd src/firecracker/src; cargo pkgid | cut -d# -f2 | cut -d. -f1-2") - S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION-secret-hiding/$(uname -m) ARTIFACTS=$MICROVM_IMAGES_DIR/$(uname -m) if [ ! 
-d "$ARTIFACTS" ]; then mkdir -pv $ARTIFACTS diff --git a/tools/setup-ci-artifacts.sh b/tools/setup-ci-artifacts.sh index 0d524658b51..079eda888d8 100755 --- a/tools/setup-ci-artifacts.sh +++ b/tools/setup-ci-artifacts.sh @@ -12,7 +12,7 @@ say "Setup CI artifacts" cd build/img/$(uname -m) say "Fix executable permissions" -find "firecracker" -type f |xargs chmod -c 755 +find "firecracker" -type f |xargs chmod -c 755 || true say "Generate SSH key to connect from host" if [ ! -s id_rsa ]; then