From 8affa1379be7f022c7100fd9e91a81001fc6c97d Mon Sep 17 00:00:00 2001 From: Matias Ezequiel Vara Larsen Date: Mon, 29 Jul 2024 11:41:26 -0400 Subject: [PATCH 1/2] Use create_guest_memfd() and set_user_memory_region2() Signed-off-by: Matias Ezequiel Vara Larsen --- src/vmm/src/builder.rs | 4 +- src/vmm/src/linux/vstate.rs | 83 ++++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 8eda82a1..904ed6b4 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -904,7 +904,7 @@ pub(crate) fn setup_vm( let mut vm = Vm::new(kvm.fd()) .map_err(Error::Vm) .map_err(StartMicrovmError::Internal)?; - vm.memory_init(guest_memory, kvm.max_memslots()) + vm.memory_init(guest_memory, kvm.max_memslots(), false) .map_err(Error::Vm) .map_err(StartMicrovmError::Internal)?; Ok(vm) @@ -918,7 +918,7 @@ pub(crate) fn setup_vm( let mut vm = Vm::new(kvm.fd(), tee_config) .map_err(Error::Vm) .map_err(StartMicrovmError::Internal)?; - vm.memory_init(guest_memory, kvm.max_memslots()) + vm.memory_init(guest_memory, kvm.max_memslots(), false) .map_err(Error::Vm) .map_err(StartMicrovmError::Internal)?; Ok(vm) diff --git a/src/vmm/src/linux/vstate.rs b/src/vmm/src/linux/vstate.rs index 77726880..f86cdd47 100644 --- a/src/vmm/src/linux/vstate.rs +++ b/src/vmm/src/linux/vstate.rs @@ -10,6 +10,7 @@ use libc::{c_int, c_void, siginfo_t}; use std::cell::Cell; use std::fmt::{Display, Formatter}; use std::io; +use std::os::fd::RawFd; #[cfg(feature = "tee")] use std::os::unix::io::RawFd; @@ -45,7 +46,10 @@ use kvm_bindings::{ Msrs, KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_MAX_CPUID_ENTRIES, KVM_PIT_SPEAKER_DUMMY, }; -use kvm_bindings::{kvm_userspace_memory_region, KVM_API_VERSION}; +use kvm_bindings::{ + kvm_create_guest_memfd, kvm_userspace_memory_region, kvm_userspace_memory_region2, + KVM_API_VERSION, KVM_MEM_GUEST_MEMFD, +}; use kvm_ioctls::*; 
use utils::eventfd::EventFd; use utils::signal::{register_signal_handler, sigrtmin, Killable}; @@ -112,6 +116,10 @@ pub enum Error { SetUserMemoryRegion(kvm_ioctls::Error), /// Error creating memory map for SHM region. ShmMmap(io::Error), + /// Cannot set the memory regions. + SetUserMemoryRegion2(kvm_ioctls::Error), + /// Cannot create guest memfd. + CreateGuestMemfd(kvm_ioctls::Error), #[cfg(feature = "amd-sev")] /// Error initializing the Secure Virtualization Backend (SEV). SevSecVirtInit(SevError), @@ -272,6 +280,8 @@ impl Display for Error { ), SetUserMemoryRegion(e) => write!(f, "Cannot set the memory regions: {e}"), ShmMmap(e) => write!(f, "Error creating memory map for SHM region: {e}"), + SetUserMemoryRegion2(e) => write!(f, "Cannot set the memory regions: {e}"), + CreateGuestMemfd(e) => write!(f, "Cannot create guest memfd: {e}"), #[cfg(feature = "tee")] SevSecVirtInit(e) => { write!( @@ -554,6 +564,7 @@ impl Vm { &mut self, guest_mem: &GuestMemoryMmap, kvm_max_memslots: usize, + require_guest_memfd: bool, ) -> Result<()> { if guest_mem.num_regions() > kvm_max_memslots { return Err(Error::NotEnoughMemorySlots); @@ -561,20 +572,54 @@ impl Vm { for region in guest_mem.iter() { // It's safe to unwrap because the guest address is valid. let host_addr = guest_mem.get_host_address(region.start_addr()).unwrap(); - debug!("Guest memory starts at {:x?}", host_addr); - let memory_region = kvm_userspace_memory_region { - slot: self.next_mem_slot, - guest_phys_addr: region.start_addr().raw_value(), - memory_size: region.len(), - userspace_addr: host_addr as u64, - flags: 0, - }; - // Safe because we mapped the memory region, we made sure that the regions - // are not overlapping. 
- unsafe { - self.fd - .set_user_memory_region(memory_region) - .map_err(Error::SetUserMemoryRegion)?; + info!("Guest memory starts at {:x?}", host_addr); + + if require_guest_memfd { + let gmem = kvm_create_guest_memfd { + size: region.len(), + flags: 0, + reserved: [0; 6], + }; + + let id: RawFd = self + .fd + .create_guest_memfd(gmem) + .map_err(Error::CreateGuestMemfd)?; + + let memory_region = kvm_userspace_memory_region2 { + slot: self.next_mem_slot as u32, + flags: KVM_MEM_GUEST_MEMFD, + guest_phys_addr: region.start_addr().raw_value(), + memory_size: region.len(), + userspace_addr: host_addr as u64, + guest_memfd_offset: 0, + guest_memfd: id as u32, + pad1: 0, + pad2: [0; 14], + }; + + // Safe because we mapped the memory region, we made sure that the regions + // are not overlapping. + unsafe { + self.fd + .set_user_memory_region2(memory_region) + .map_err(Error::SetUserMemoryRegion2)?; + }; + } else { + let memory_region = kvm_userspace_memory_region { + slot: self.next_mem_slot as u32, + guest_phys_addr: region.start_addr().raw_value(), + memory_size: region.len(), + userspace_addr: host_addr as u64, + flags: 0, + }; + // Safe because we mapped the memory region, we made sure that the regions + // are not overlapping. + unsafe { + self.fd + .set_user_memory_region(memory_region) + .map_err(Error::SetUserMemoryRegion)?; + }; }; self.next_mem_slot += 1; } @@ -1510,7 +1555,7 @@ mod tests { let kvm = KvmContext::new().unwrap(); let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), mem_size)]).unwrap(); let mut vm = Vm::new(kvm.fd()).expect("Cannot create new vm"); - assert!(vm.memory_init(&gm, kvm.max_memslots()).is_ok()); + assert!(vm.memory_init(&gm, kvm.max_memslots(), false).is_ok()); let exit_evt = EventFd::new(utils::eventfd::EFD_NONBLOCK).unwrap(); @@ -1565,7 +1610,7 @@ mod tests { // Create valid memory region and test that the initialization is successful. 
let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap(); - assert!(vm.memory_init(&gm, kvm_context.max_memslots()).is_ok()); + assert!(vm.memory_init(&gm, kvm_context.max_memslots(), false).is_ok()); // Set the maximum number of memory slots to 1 in KvmContext to check the error // path of memory_init. Create 2 non-overlapping memory slots. @@ -1575,7 +1620,7 @@ mod tests { (GuestAddress(0x1001), 0x2000), ]) .unwrap(); - assert!(vm.memory_init(&gm, kvm_context.max_memslots()).is_err()); + assert!(vm.memory_init(&gm, kvm_context.max_memslots(), false).is_err()); } #[cfg(target_arch = "x86_64")] @@ -1656,7 +1701,7 @@ mod tests { let kvm = KvmContext::new().unwrap(); let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap(); let mut vm = Vm::new(kvm.fd()).expect("new vm failed"); - assert!(vm.memory_init(&gm, kvm.max_memslots()).is_ok()); + assert!(vm.memory_init(&gm, kvm.max_memslots(), false).is_ok()); // Try it for when vcpu id is 0. let mut vcpu = Vcpu::new_aarch64( From 8b97dc4d256dddb616b70fc64527b114234debb1 Mon Sep 17 00:00:00 2001 From: Matias Ezequiel Vara Larsen Date: Fri, 9 Aug 2024 04:50:26 -0400 Subject: [PATCH 2/2] Support ARM CCA feature Enable to build confidential guests using ARM CCA (Confidential Computing Architecture). This work relies on v7 series for Linux and v5 series for KVM. This has been tested only on the corresponding FVP model simulator. For testing, you require specific kvm-ioctls and kvm-bindings crates. 
Signed-off-by: Matias Ezequiel Vara Larsen --- Makefile | 3 + src/arch/Cargo.toml | 6 +- src/arch/src/aarch64/fdt.rs | 3 + src/arch/src/aarch64/linux/regs.rs | 4 +- src/cpuid/Cargo.toml | 4 +- src/devices/Cargo.toml | 2 + src/devices/src/virtio/console/device.rs | 15 ++- src/devices/src/virtio/console/mod.rs | 1 + src/devices/src/virtio/fs/device.rs | 13 +- src/devices/src/virtio/rng/device.rs | 7 +- src/libkrun/Cargo.toml | 2 + src/libkrun/src/lib.rs | 68 ++++++++++ src/vmm/Cargo.toml | 8 +- src/vmm/src/builder.rs | 161 +++++++++++++++++++++-- src/vmm/src/lib.rs | 3 + src/vmm/src/linux/vstate.rs | 128 ++++++++++++++++-- 16 files changed, 388 insertions(+), 40 deletions(-) diff --git a/Makefile b/Makefile index 36f98074..994730ce 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,9 @@ ifeq ($(SEV),1) INIT_SRC += $(SNP_INIT_SRC) BUILD_INIT = 0 endif +ifeq ($(CCA), 1) + FEATURE_FLAGS := --features cca +endif ifeq ($(GPU),1) FEATURE_FLAGS += --features gpu endif diff --git a/src/arch/Cargo.toml b/src/arch/Cargo.toml index baaedda5..cb183142 100644 --- a/src/arch/Cargo.toml +++ b/src/arch/Cargo.toml @@ -5,6 +5,8 @@ authors = ["The Chromium OS Authors"] edition = "2021" [features] +default = ["cca"] +cca = [] tee = [] amd-sev = [ "tee" ] efi = [] @@ -18,8 +20,8 @@ smbios = { path = "../smbios" } utils = { path = "../utils" } [target.'cfg(target_os = "linux")'.dependencies] -kvm-bindings = { version = ">=0.8", features = ["fam-wrappers"] } -kvm-ioctls = ">=0.17" +kvm-bindings = { version = ">=0.8", features = ["fam-wrappers"] , git = "https://github.com/virtee/kvm-bindings", branch = "add_bindings_for_realms" } +kvm-ioctls = { version = ">=0.17", git = "https://github.com/virtee/kvm-ioctls", branch = "cca" } [target.'cfg(target_arch = "aarch64")'.dependencies] vm-fdt = ">= 0.2.0" diff --git a/src/arch/src/aarch64/fdt.rs b/src/arch/src/aarch64/fdt.rs index 02b45112..4e051506 100644 --- a/src/arch/src/aarch64/fdt.rs +++ b/src/arch/src/aarch64/fdt.rs @@ -285,7 +285,10 @@ fn 
create_psci_node(fdt: &mut FdtWriter) -> Result<()> { // Two methods available: hvc and smc. // As per documentation, PSCI calls between a guest and hypervisor may use the HVC conduit instead of SMC. // So, since we are using kvm, we need to use hvc. + #[cfg(not(feature = "cca"))] fdt.property_string("method", "hvc")?; + #[cfg(feature = "cca")] + fdt.property_string("method", "smc")?; fdt.end_node(node)?; Ok(()) diff --git a/src/arch/src/aarch64/linux/regs.rs b/src/arch/src/aarch64/linux/regs.rs index 81146b8a..71dffe5b 100644 --- a/src/arch/src/aarch64/linux/regs.rs +++ b/src/arch/src/aarch64/linux/regs.rs @@ -125,8 +125,10 @@ arm64_sys_reg!(MPIDR_EL1, 3, 0, 0, 0, 5); /// * `boot_ip` - Starting instruction pointer. /// * `mem` - Reserved DRAM for current VM. pub fn setup_regs(vcpu: &VcpuFd, cpu_id: u8, boot_ip: u64, mem: &GuestMemoryMmap) -> Result<()> { - // Get the register index of the PSTATE (Processor State) register. + // PSTATE cannot be accessed from the host in CCA + #[cfg(not(feature = "cca"))] #[allow(deref_nullptr)] + // Get the register index of the PSTATE (Processor State) register. 
vcpu.set_one_reg(arm64_core_reg!(pstate), &PSTATE_FAULT_BITS_64.to_le_bytes()) .map_err(Error::SetCoreRegister)?; diff --git a/src/cpuid/Cargo.toml b/src/cpuid/Cargo.toml index 41c53aee..d1e3214e 100644 --- a/src/cpuid/Cargo.toml +++ b/src/cpuid/Cargo.toml @@ -8,5 +8,5 @@ edition = "2021" vmm-sys-util = ">=0.11" [target.'cfg(target_os = "linux")'.dependencies] -kvm-bindings = { version = ">=0.8", features = ["fam-wrappers"] } -kvm-ioctls = ">=0.17" +kvm-bindings = { version = ">=0.8", features = ["fam-wrappers"] , git = "https://github.com/virtee/kvm-bindings", branch = "add_bindings_for_realms" } +kvm-ioctls = { version = ">=0.17", git = "https://github.com/virtee/kvm-ioctls", branch = "cca" } diff --git a/src/devices/Cargo.toml b/src/devices/Cargo.toml index 7e516346..b8eef4c4 100644 --- a/src/devices/Cargo.toml +++ b/src/devices/Cargo.toml @@ -5,7 +5,9 @@ authors = ["The Chromium OS Authors"] edition = "2021" [features] +default = ["cca"] tee = [] +cca = [] amd-sev = ["blk", "tee"] net = [] blk = [] diff --git a/src/devices/src/virtio/console/device.rs b/src/devices/src/virtio/console/device.rs index 6bbdda83..80445dad 100644 --- a/src/devices/src/virtio/console/device.rs +++ b/src/devices/src/virtio/console/device.rs @@ -30,9 +30,18 @@ use crate::virtio::{PortDescription, VmmExitObserver}; pub(crate) const CONTROL_RXQ_INDEX: usize = 2; pub(crate) const CONTROL_TXQ_INDEX: usize = 3; -pub(crate) const AVAIL_FEATURES: u64 = 1 << uapi::VIRTIO_CONSOLE_F_SIZE as u64 - | 1 << uapi::VIRTIO_CONSOLE_F_MULTIPORT as u64 - | 1 << uapi::VIRTIO_F_VERSION_1 as u64; +// CCA requires VIRTIO_F_ACCESS_PLATFORM to ensure DMA-APIs +// are triggered for virtio in Linux +pub(crate) const AVAIL_FEATURES: u64 = if cfg!(feature = "cca") { + 1 << uapi::VIRTIO_CONSOLE_F_SIZE as u64 + | 1 << uapi::VIRTIO_CONSOLE_F_MULTIPORT as u64 + | 1 << uapi::VIRTIO_F_VERSION_1 as u64 + | 1 << uapi::VIRTIO_F_ACCESS_PLATFORM as u64 +} else { + 1 << uapi::VIRTIO_CONSOLE_F_SIZE as u64 + | 1 << 
uapi::VIRTIO_CONSOLE_F_MULTIPORT as u64 + | 1 << uapi::VIRTIO_F_VERSION_1 as u64 +}; #[repr(C)] #[derive(Default)] diff --git a/src/devices/src/virtio/console/mod.rs b/src/devices/src/virtio/console/mod.rs index bbaba4dd..c6d0fb9d 100644 --- a/src/devices/src/virtio/console/mod.rs +++ b/src/devices/src/virtio/console/mod.rs @@ -22,6 +22,7 @@ mod defs { pub const VIRTIO_CONSOLE_F_MULTIPORT: u32 = 1; pub const VIRTIO_F_VERSION_1: u32 = 32; pub const VIRTIO_ID_CONSOLE: u32 = 3; + pub const VIRTIO_F_ACCESS_PLATFORM: u32 = 33; } #[allow(dead_code)] diff --git a/src/devices/src/virtio/fs/device.rs b/src/devices/src/virtio/fs/device.rs index 9d7a21e0..360bf721 100644 --- a/src/devices/src/virtio/fs/device.rs +++ b/src/devices/src/virtio/fs/device.rs @@ -9,7 +9,10 @@ use std::thread::JoinHandle; #[cfg(target_os = "macos")] use hvf::MemoryMapping; use utils::eventfd::{EventFd, EFD_NONBLOCK}; -use virtio_bindings::{virtio_config::VIRTIO_F_VERSION_1, virtio_ring::VIRTIO_RING_F_EVENT_IDX}; +use virtio_bindings::{ + virtio_config::VIRTIO_F_ACCESS_PLATFORM, virtio_config::VIRTIO_F_VERSION_1, + virtio_ring::VIRTIO_RING_F_EVENT_IDX, +}; use vm_memory::{ByteValued, GuestMemoryMmap}; use super::super::{ @@ -70,7 +73,13 @@ impl Fs { .push(EventFd::new(utils::eventfd::EFD_NONBLOCK).map_err(FsError::EventFd)?); } - let avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); + let avail_features = if cfg!(feature = "cca") { + (1u64 << VIRTIO_F_VERSION_1) + | (1u64 << VIRTIO_RING_F_EVENT_IDX) + | (1 << VIRTIO_F_ACCESS_PLATFORM as u64) + } else { + (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX) + }; let tag = fs_id.into_bytes(); let mut config = VirtioFsConfig::default(); diff --git a/src/devices/src/virtio/rng/device.rs b/src/devices/src/virtio/rng/device.rs index eafcb06b..977e6313 100644 --- a/src/devices/src/virtio/rng/device.rs +++ b/src/devices/src/virtio/rng/device.rs @@ -13,12 +13,17 @@ use super::super::{ use super::{defs, defs::uapi}; 
use crate::legacy::GicV3; use crate::Error as DeviceError; +use virtio_bindings::virtio_config::VIRTIO_F_ACCESS_PLATFORM; // Request queue. pub(crate) const REQ_INDEX: usize = 0; // Supported features. -pub(crate) const AVAIL_FEATURES: u64 = 1 << uapi::VIRTIO_F_VERSION_1 as u64; +pub(crate) const AVAIL_FEATURES: u64 = if cfg!(feature = "cca") { + 1 << uapi::VIRTIO_F_VERSION_1 as u64 | 1 << VIRTIO_F_ACCESS_PLATFORM as u64 +} else { + 1 << uapi::VIRTIO_F_VERSION_1 as u64 +}; #[derive(Copy, Clone, Debug, Default)] #[repr(C, packed)] diff --git a/src/libkrun/Cargo.toml b/src/libkrun/Cargo.toml index e1234751..d81834ba 100644 --- a/src/libkrun/Cargo.toml +++ b/src/libkrun/Cargo.toml @@ -16,12 +16,14 @@ snd = [] virgl_resource_map2 = [] [dependencies] +vm-memory = { version = ">=0.13", features = ["backend-mmap"] } crossbeam-channel = "0.5" env_logger = "0.9.0" libc = ">=0.2.39" log = "0.4.0" once_cell = "1.4.1" +kvm-bindings = { version = ">=0.8", features = ["fam-wrappers"] , git = "https://github.com/virtee/kvm-bindings", branch = "add_bindings_for_realms" } devices = { path = "../devices" } polly = { path = "../polly" } utils = { path = "../utils" } diff --git a/src/libkrun/src/lib.rs b/src/libkrun/src/lib.rs index 9af22c51..5eb0355c 100644 --- a/src/libkrun/src/lib.rs +++ b/src/libkrun/src/lib.rs @@ -1,6 +1,13 @@ #[macro_use] extern crate log; +use crossbeam_channel::unbounded; +use kvm_bindings::kvm_memory_attributes; +use libc::fallocate; +use libc::madvise; +use libc::FALLOC_FL_KEEP_SIZE; +use libc::FALLOC_FL_PUNCH_HOLE; +use libc::MADV_DONTNEED; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::convert::TryInto; @@ -11,10 +18,13 @@ use std::ffi::CString; #[cfg(target_os = "linux")] use std::os::fd::AsRawFd; use std::os::fd::RawFd; +use std::os::raw::c_void; use std::path::PathBuf; use std::slice; use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::Mutex; +use vm_memory::GuestMemoryRegion; +use vm_memory::{Address, 
GuestMemory}; #[cfg(target_os = "macos")] use crossbeam_channel::unbounded; @@ -1225,9 +1235,12 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { #[cfg(target_os = "macos")] let (sender, receiver) = unbounded(); + let (io_sender, receiver) = unbounded(); + let _vmm = match vmm::builder::build_microvm( &ctx_cfg.vmr, &mut event_manager, + io_sender, ctx_cfg.shutdown_efd, #[cfg(target_os = "macos")] sender, @@ -1242,6 +1255,61 @@ pub extern "C" fn krun_start_enter(ctx_id: u32) -> i32 { #[cfg(target_os = "macos")] let mapper_vmm = _vmm.clone(); + let vm = _vmm.lock().unwrap().kvm_vm().fd.clone(); + let guest_mem = _vmm.lock().unwrap().guest_memory().clone(); + let guest_memfd = _vmm.lock().unwrap().guest_memfd_vec.clone(); + + std::thread::spawn(move || loop { + match receiver.recv() { + Err(e) => error!("Error in receiver: {:?}", e), + Ok(m) => { + let _ret = vm + .lock() + .unwrap() + .set_memory_attributes(kvm_memory_attributes { + address: m.addr, + size: m.size, + attributes: m.attributes as u64, + flags: 0, + }); + + // from private to shared + if m.attributes == 0 { + for (index, region) in guest_mem.iter().enumerate() { + // this supposes that m.addr + m.size < region.start + region.size + // which may be false + if (region.start_addr().raw_value() + region.size() as u64) > m.addr { + let offset = m.addr - region.start_addr().raw_value(); + unsafe { + let _ret = fallocate( + *guest_memfd.get(index).unwrap(), + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset as i64, + m.size as i64, + ); + } + } + } + // from shared to private + } else { + for (_index, region) in guest_mem.iter().enumerate() { + if (region.start_addr().raw_value() + region.size() as u64) > m.addr { + let offset = m.addr - region.start_addr().raw_value(); + let host_startaddr = m.addr + offset; + unsafe { + let _ret = madvise( + host_startaddr as *mut c_void, + m.size.try_into().unwrap(), + MADV_DONTNEED, + ); + } + } + } + } + } + } + }); + #[cfg(target_os = "macos")] 
std::thread::Builder::new() .name("mapping worker".into()) diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 25ed38d7..70d49e98 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -5,8 +5,10 @@ authors = ["Amazon Firecracker team "] edition = "2021" [features] +default = ["cca"] tee = [] amd-sev = [ "blk", "tee", "codicon", "kbs-types", "procfs", "rdrand", "serde", "serde_json", "sev", "curl" ] +cca = [] net = [] blk = [] efi = [ "blk", "net" ] @@ -37,12 +39,14 @@ sev = { version = "4.0.0", features = ["openssl"], optional = true } curl = { version = "0.4", optional = true } nix = "0.24.1" +cca = { git = "https://github.com/virtee/cca" } + [target.'cfg(target_arch = "x86_64")'.dependencies] cpuid = { path = "../cpuid" } [target.'cfg(target_os = "linux")'.dependencies] -kvm-bindings = { version = ">=0.10", features = ["fam-wrappers"] } -kvm-ioctls = ">=0.17" +kvm-bindings = { version = ">=0.8", features = ["fam-wrappers"] , git = "https://github.com/virtee/kvm-bindings", branch = "add_bindings_for_realms" } +kvm-ioctls = { version = ">=0.17", git = "https://github.com/virtee/kvm-ioctls", branch = "cca" } [target.'cfg(target_os = "macos")'.dependencies] hvf = { path = "../hvf" } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 904ed6b4..116cbcaf 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -3,13 +3,17 @@ //! Enables pre-boot setup, instantiation and booting of a Firecracker VMM. 
+use crate::vstate::MemProperties; +use cca::Algo; #[cfg(target_os = "macos")] use crossbeam_channel::{unbounded, Sender}; +use std::cmp::max; use std::fmt::{Display, Formatter}; use std::fs::File; use std::io; #[cfg(target_os = "linux")] use std::os::fd::AsRawFd; +use std::os::fd::RawFd; use std::path::PathBuf; use std::sync::{Arc, Mutex}; @@ -47,7 +51,7 @@ use crate::vmm_config::boot_source::DEFAULT_KERNEL_CMDLINE; use crate::vmm_config::fs::FsDeviceConfig; #[cfg(target_os = "linux")] use crate::vstate::KvmContext; -#[cfg(all(target_os = "linux", feature = "tee"))] +#[cfg(all(target_os = "linux", any(feature = "tee", feature = "cca")))] use crate::vstate::MeasuredRegion; use crate::vstate::{Error as VstateError, Vcpu, VcpuConfig, Vm}; use arch::ArchMemoryInfo; @@ -57,6 +61,8 @@ use device_manager::shm::ShmManager; #[cfg(not(feature = "tee"))] use devices::virtio::{fs::ExportTable, VirtioShmRegion}; #[cfg(feature = "tee")] +use kbs_types::Tee; +#[cfg(feature = "tee")] use kvm_bindings::KVM_MAX_CPUID_ENTRIES; use libc::{STDERR_FILENO, STDIN_FILENO, STDOUT_FILENO}; use nix::unistd::isatty; @@ -72,6 +78,11 @@ use vm_memory::Bytes; use vm_memory::GuestRegionMmap; use vm_memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; +use vm_memory::GuestMemoryRegion; + +use crossbeam_channel::Sender; +use kvm_bindings::KVM_ARM_VCPU_REC; + #[cfg(feature = "efi")] static EDK2_BINARY: &[u8] = include_bytes!("../../../edk2/KRUN_EFI.silent.fd"); @@ -352,6 +363,7 @@ enum Payload { pub fn build_microvm( vm_resources: &super::resources::VmResources, event_manager: &mut EventManager, + io_sender: Sender, _shutdown_efd: Option, #[cfg(target_os = "macos")] _map_sender: Sender, ) -> std::result::Result>, StartMicrovmError> { @@ -413,9 +425,11 @@ pub fn build_microvm( Some(s) => kernel_cmdline.insert_str(s).unwrap(), }; + let mut guest_memfd: Vec = vec![]; + #[cfg(not(feature = "tee"))] #[allow(unused_mut)] - let mut vm = setup_vm(&guest_memory)?; + let mut vm = setup_vm(&guest_memory, &mut 
guest_memfd)?; #[cfg(feature = "tee")] let (kvm, mut vm) = { @@ -485,6 +499,45 @@ pub fn build_microvm( m }; + #[cfg(feature = "cca")] + let measured_regions = { + let m = vec![ + MeasuredRegion { + guest_addr: kernel_bundle.guest_addr, + // TODO: remove host_addr? + host_addr: guest_memory + .get_host_address(GuestAddress(kernel_bundle.guest_addr)) + .unwrap() as u64, + size: kernel_bundle.size, + populate: true, + }, + MeasuredRegion { + guest_addr: kernel_bundle.guest_addr + kernel_bundle.size as u64, + host_addr: guest_memory + .get_host_address(GuestAddress( + kernel_bundle.guest_addr + kernel_bundle.size as u64, + )) + .unwrap() as u64, + size: vm_resources.vm_config().mem_size_mib.unwrap() << 20 - kernel_bundle.size, + populate: false, + }, + // The region used for the FDT must be populated. However, we only know the addr and the size after + // configure_system() but at that point guest_memory is already shared. For the moment, hardcode the + // fdt addr and size. + MeasuredRegion { + guest_addr: 0x2DFE00000, + host_addr: guest_memory + .get_host_address(GuestAddress(0x2DFE00000)) + .unwrap() as u64, + // size must be page aligned + size: 0x1000, + populate: true, + }, + ]; + + m + }; + // On x86_64 always create a serial device, // while on aarch64 only create it if 'console=' is specified in the boot args. 
let serial_device = if cfg!(feature = "efi") { @@ -572,16 +625,19 @@ pub fn build_microvm( &guest_memory, GuestAddress(kernel_bundle.guest_addr), &exit_evt, + io_sender, ) .map_err(StartMicrovmError::Internal)?; setup_interrupt_controller(&mut vm, vcpu_config.vcpu_count)?; + /* + This makes the kernel block while parsing it, I do not know why attach_legacy_devices( &vm, &mut mmio_device_manager, &mut kernel_cmdline, serial_device, - )?; + )?; */ } #[cfg(all(target_arch = "aarch64", target_os = "macos"))] @@ -622,6 +678,7 @@ pub fn build_microvm( exit_observers: Vec::new(), vm, mmio_device_manager, + guest_memfd_vec: guest_memfd, #[cfg(target_arch = "x86_64")] pio_device_manager, }; @@ -674,7 +731,7 @@ pub fn build_microvm( if let Some(vsock) = vm_resources.vsock.get() { attach_unixsock_vsock_device(&mut vmm, vsock, event_manager, intc.clone())?; #[cfg(not(feature = "net"))] - vmm.kernel_cmdline.insert_str("tsi_hijack")?; + //vmm.kernel_cmdline.insert_str("tsi_hijack")?; #[cfg(feature = "net")] if vm_resources .net_builder .is_empty() { // Only enable TSI if we don't have any network devices. - vmm.kernel_cmdline.insert_str("tsi_hijack")?; + //vmm.kernel_cmdline.insert_str("tsi_hijack")?; } } #[cfg(feature = "net")] @@ -747,6 +804,55 @@ pub fn build_microvm( println!("Starting TEE/microVM."); } + // after this point guest memory and regs are not accessible anymore + #[cfg(feature = "cca")] + { + let _ = vmm + .kvm_vm() + .realm + .configure_measurement(&vmm.kvm_vm().fd.lock().unwrap(), Algo::AlgoSha256); + + vmm.kvm_vm() + .realm + .create_realm_descriptor(&vmm.kvm_vm().fd.lock().unwrap()) + .unwrap(); + + println!("Injecting and measuring memory regions. 
This may take a while."); + + for region in measured_regions.iter() { + if region.populate { + vmm.kvm_vm() + .realm + .populate( + &vmm.kvm_vm().fd.lock().unwrap(), + region.guest_addr, + region.size.try_into().unwrap(), + ) + .unwrap(); + } else { + vmm.kvm_vm() + .realm + .initiate( + &vmm.kvm_vm().fd.lock().unwrap(), + region.guest_addr, + region.size.try_into().unwrap(), + ) + .unwrap(); + } + } + + let feature = KVM_ARM_VCPU_REC as i32; + + for vcpu in vcpus.iter() { + vcpu.fd.vcpu_finalize(&feature).unwrap(); + } + + vmm.kvm_vm() + .realm + .activate(&vmm.kvm_vm().fd.lock().unwrap()) + .unwrap(); + } + vmm.start_vcpus(vcpus) .map_err(StartMicrovmError::Internal)?; @@ -894,7 +1000,7 @@ fn load_cmdline(vmm: &Vmm) -> std::result::Result<(), StartMicrovmError> { .map_err(StartMicrovmError::LoadCommandline) } -#[cfg(all(target_os = "linux", not(feature = "tee")))] +#[cfg(all(target_os = "linux", not(feature = "tee"), not(feature = "cca")))] pub(crate) fn setup_vm( guest_memory: &GuestMemoryMmap, ) -> std::result::Result { @@ -909,6 +1015,28 @@ pub(crate) fn setup_vm( .map_err(StartMicrovmError::Internal)?; Ok(vm) } +#[cfg(all(target_os = "linux", feature = "cca"))] +pub(crate) fn setup_vm( + guest_memory: &GuestMemoryMmap, + guest_memfd: &mut Vec, +) -> std::result::Result { + let kvm: KvmContext = KvmContext::new() + .map_err(Error::KvmContext) + .map_err(StartMicrovmError::Internal)?; + + // calculate max_addr for max_ipa + let mut vm = Vm::new( + kvm.fd(), + (guest_memory.last_addr().raw_value() * 2) as usize, + ) + .map_err(Error::Vm) + .map_err(StartMicrovmError::Internal)?; + + vm.memory_init(guest_memory, kvm.max_memslots(), guest_memfd, true) + .map_err(Error::Vm) + .map_err(StartMicrovmError::Internal)?; + Ok(vm) +} #[cfg(all(target_os = "linux", feature = "tee"))] pub(crate) fn setup_vm( kvm: &KvmContext, @@ -1017,13 +1145,13 @@ fn attach_legacy_devices( ) -> std::result::Result<(), StartMicrovmError> { if let Some(serial) = serial { mmio_device_manager 
- .register_mmio_serial(vm.fd(), kernel_cmdline, serial) + .register_mmio_serial(&vm.fd.lock().unwrap(), kernel_cmdline, serial) .map_err(Error::RegisterMMIODevice) .map_err(StartMicrovmError::Internal)?; } mmio_device_manager - .register_mmio_rtc(vm.fd()) + .register_mmio_rtc(&vm.fd.lock().unwrap()) .map_err(Error::RegisterMMIODevice) .map_err(StartMicrovmError::Internal)?; @@ -1103,17 +1231,19 @@ fn create_vcpus_aarch64( guest_mem: &GuestMemoryMmap, entry_addr: GuestAddress, exit_evt: &EventFd, + sender_io: Sender, ) -> super::Result> { let mut vcpus = Vec::with_capacity(vcpu_config.vcpu_count as usize); for cpu_index in 0..vcpu_config.vcpu_count { - let mut vcpu = Vcpu::new_aarch64( + let mut vcpu: Vcpu = Vcpu::new_aarch64( cpu_index, - vm.fd(), + &vm.fd.lock().unwrap(), exit_evt.try_clone().map_err(Error::EventFd)?, + sender_io.clone(), ) .map_err(Error::Vcpu)?; - vcpu.configure_aarch64(vm.fd(), guest_mem, entry_addr) + vcpu.configure_aarch64(&vm.fd.lock().unwrap(), guest_mem, entry_addr) .map_err(Error::Vcpu)?; vcpus.push(vcpu); @@ -1175,9 +1305,12 @@ fn attach_mmio_device( let _cmdline = &mut vmm.kernel_cmdline; #[cfg(target_os = "linux")] - let (_mmio_base, _irq) = - vmm.mmio_device_manager - .register_mmio_device(vmm.vm.fd(), device, type_id, id)?; + let (_mmio_base, _irq) = vmm.mmio_device_manager.register_mmio_device( + &vmm.vm.fd.lock().unwrap(), + device, + type_id, + id, + )?; #[cfg(target_os = "macos")] let (_mmio_base, _irq) = vmm .mmio_device_manager diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 0e680a84..22394833 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -39,6 +39,7 @@ use macos::vstate; use std::fmt::{Display, Formatter}; use std::io; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; use std::sync::{Arc, Mutex}; #[cfg(target_os = "linux")] @@ -188,6 +189,8 @@ pub struct Vmm { guest_memory: GuestMemoryMmap, arch_memory_info: ArchMemoryInfo, + pub guest_memfd_vec: Vec, + kernel_cmdline: KernelCmdline, 
vcpus_handles: Vec, diff --git a/src/vmm/src/linux/vstate.rs b/src/vmm/src/linux/vstate.rs index f86cdd47..7392c0f2 100644 --- a/src/vmm/src/linux/vstate.rs +++ b/src/vmm/src/linux/vstate.rs @@ -8,13 +8,17 @@ use crossbeam_channel::{unbounded, Receiver, Sender, TryRecvError}; use libc::{c_int, c_void, siginfo_t}; use std::cell::Cell; +use std::cmp::max; use std::fmt::{Display, Formatter}; use std::io; use std::os::fd::RawFd; +use std::sync::Arc; +use std::sync::Mutex; #[cfg(feature = "tee")] use std::os::unix::io::RawFd; +use kvm_ioctls::VcpuExit::Unsupported; use std::result; use std::sync::atomic::{fence, Ordering}; #[cfg(not(test))] @@ -47,8 +51,10 @@ use kvm_bindings::{ KVM_MAX_CPUID_ENTRIES, KVM_PIT_SPEAKER_DUMMY, }; use kvm_bindings::{ - kvm_create_guest_memfd, kvm_userspace_memory_region, kvm_userspace_memory_region2, - KVM_API_VERSION, KVM_MEM_GUEST_MEMFD, + kvm_create_guest_memfd, kvm_memory_attributes, kvm_userspace_memory_region, + kvm_userspace_memory_region2, KVM_API_VERSION, KVM_MEMORY_ATTRIBUTE_PRIVATE, + KVM_MEMORY_EXIT_FLAG_PRIVATE, KVM_MEM_GUEST_MEMFD, KVM_VM_TYPE_ARM_IPA_SIZE_MASK, + KVM_VM_TYPE_ARM_REALM, }; use kvm_ioctls::*; use utils::eventfd::EventFd; @@ -64,6 +70,9 @@ use sev::launch::sev as sev_launch; #[cfg(feature = "amd-sev")] use sev::launch::snp; +#[cfg(feature = "cca")] +use cca::Realm; + /// Signal number (SIGRTMIN) used to kick Vcpus. pub(crate) const VCPU_RTSIG_OFFSET: i32 = 0; @@ -405,12 +414,13 @@ impl Display for Error { pub type Result = result::Result; -#[cfg(feature = "tee")] +#[cfg(any(feature = "tee", feature = "cca"))] #[derive(Debug)] pub struct MeasuredRegion { pub guest_addr: u64, pub host_addr: u64, pub size: usize, + pub populate: bool, } /// Describes a KVM context that gets attached to the microVM. @@ -464,7 +474,7 @@ impl KvmContext { /// A wrapper around creating and using a VM. pub struct Vm { - fd: VmFd, + pub fd: Arc>, next_mem_slot: u32, // X86 specific fields. 
@@ -486,11 +496,14 @@ pub struct Vm { #[cfg(feature = "amd-sev")] pub tee: Tee, + + #[cfg(feature = "cca")] + pub realm: Realm, } impl Vm { /// Constructs a new `Vm` using the given `Kvm` instance. - #[cfg(not(feature = "tee"))] + #[cfg(all(not(feature = "tee"), not(feature = "cca")))] pub fn new(kvm: &Kvm) -> Result { //create fd for interacting with kvm-vm specific functions let vm_fd = kvm.create_vm().map_err(Error::VmFd)?; @@ -515,6 +528,27 @@ impl Vm { }) } + #[cfg(feature = "cca")] + pub fn new(kvm: &Kvm, max_ipa: usize) -> Result { + //create fd for interacting with kvm-vm specific functions + let ipa_bits = max(64u32 - max_ipa.leading_zeros() - 1, 32) + 1; + let vm_fd = kvm + .create_vm_with_type( + (KVM_VM_TYPE_ARM_REALM | (ipa_bits & KVM_VM_TYPE_ARM_IPA_SIZE_MASK)).into(), + ) + .map_err(Error::VmFd)?; + + let realm = Realm::new(); + + Ok(Vm { + next_mem_slot: 0, + fd: Arc::new(Mutex::new(vm_fd)), + #[cfg(target_arch = "aarch64")] + irqchip_handle: None, + realm, + }) + } + #[cfg(feature = "amd-sev")] pub fn new(kvm: &Kvm, tee_config: &TeeConfig) -> Result { //create fd for interacting with kvm-vm specific functions @@ -564,6 +598,7 @@ impl Vm { &mut self, guest_mem: &GuestMemoryMmap, kvm_max_memslots: usize, + guest_memfd: &mut Vec, require_guest_memfd: bool, ) -> Result<()> { if guest_mem.num_regions() > kvm_max_memslots { @@ -583,9 +618,13 @@ impl Vm { let id: RawFd = self .fd + .lock() + .unwrap() .create_guest_memfd(gmem) .map_err(Error::CreateGuestMemfd)?; + guest_memfd.push(id); + let memory_region = kvm_userspace_memory_region2 { slot: self.next_mem_slot as u32, flags: KVM_MEM_GUEST_MEMFD, @@ -602,9 +641,22 @@ impl Vm { // are not overlapping. 
unsafe { self.fd + .lock() + .unwrap() .set_user_memory_region2(memory_region) .map_err(Error::SetUserMemoryRegion2)?; }; + + // set private by default when using guestmemfd + // this imitates QEMU behavior + let attr = kvm_memory_attributes { + address: region.start_addr().raw_value(), + size: region.len(), + attributes: KVM_MEMORY_ATTRIBUTE_PRIVATE as u64, + flags: 0, + }; + + self.fd.lock().unwrap().set_memory_attributes(attr).unwrap(); } else { let memory_region = kvm_userspace_memory_region { slot: self.next_mem_slot as u32, @@ -617,6 +669,8 @@ impl Vm { // are not overlapping. unsafe { self.fd + .lock() + .unwrap() .set_user_memory_region(memory_region) .map_err(Error::SetUserMemoryRegion)?; }; @@ -706,7 +760,8 @@ impl Vm { #[cfg(target_arch = "aarch64")] pub fn setup_irqchip(&mut self, vcpu_count: u8) -> Result<()> { self.irqchip_handle = Some( - arch::aarch64::gic::create_gic(&self.fd, vcpu_count.into()).map_err(Error::SetupGIC)?, + arch::aarch64::gic::create_gic(&self.fd.lock().unwrap(), vcpu_count.into()) + .map_err(Error::SetupGIC)?, ); Ok(()) } @@ -719,9 +774,9 @@ impl Vm { } /// Gets a reference to the kvm file descriptor owned by this VM. - pub fn fd(&self) -> &VmFd { - &self.fd - } + //pub fn fd(&self) -> &VmFd { + // &self.fd + // } #[allow(unused)] #[cfg(target_arch = "x86_64")] @@ -812,9 +867,14 @@ pub struct VcpuConfig { // Using this for easier explicit type-casting to help IDEs interpret the code. type VcpuCell = Cell>; +pub struct MemProperties { + pub addr: u64, + pub size: u64, + pub attributes: u32, +} /// A wrapper around creating and using a kvm-based VCPU. pub struct Vcpu { - fd: VcpuFd, + pub fd: VcpuFd, id: u8, mmio_bus: Option, #[allow(dead_code)] @@ -831,6 +891,9 @@ pub struct Vcpu { #[cfg(target_arch = "aarch64")] mpidr: u64, + #[cfg(feature = "cca")] + sender_io: Sender, + // The receiving end of events channel owned by the vcpu side. 
event_receiver: Receiver, // The transmitting end of the events channel which will be given to the handler. @@ -972,7 +1035,12 @@ impl Vcpu { /// * `exit_evt` - An `EventFd` that will be written into when this vcpu exits. /// * `create_ts` - A timestamp used by the vcpu to calculate its lifetime. #[cfg(target_arch = "aarch64")] - pub fn new_aarch64(id: u8, vm_fd: &VmFd, exit_evt: EventFd) -> Result { + pub fn new_aarch64( + id: u8, + vm_fd: &VmFd, + exit_evt: EventFd, + sender_io: Sender, + ) -> Result { let kvm_vcpu = vm_fd.create_vcpu(id as u64).map_err(Error::VcpuFd)?; let (event_sender, event_receiver) = unbounded(); let (response_sender, response_receiver) = unbounded(); @@ -987,6 +1055,7 @@ impl Vcpu { event_sender: Some(event_sender), response_receiver: Some(response_receiver), response_sender, + sender_io, }) } @@ -1076,6 +1145,11 @@ impl Vcpu { .map_err(Error::VcpuArmPreferredTarget)?; // We already checked that the capability is supported. kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2; + + if cfg!(feature = "cca") { + kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_REC; + } + // Non-boot cpus are powered off initially. if self.id > 0 { kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF; @@ -1273,12 +1347,36 @@ impl Vcpu { info!("Received KVM_EXIT_SHUTDOWN signal"); Ok(VcpuEmulation::Stopped) } + VcpuExit::MemoryFault { flags, gpa, size } => { + if flags & !KVM_MEMORY_EXIT_FLAG_PRIVATE as u64 != 0 { + error!("KVM_EXIT_MEMORY_FAULT: Unknown flag {}", flags); + Err(Error::VcpuUnhandledKvmExit) + } else { + // from private to shared + let mut attr = 0; + // from shared to private + if flags & KVM_MEMORY_EXIT_FLAG_PRIVATE as u64 + == KVM_MEMORY_EXIT_FLAG_PRIVATE as u64 + { + attr = KVM_MEMORY_ATTRIBUTE_PRIVATE; + }; + + let _ = self.sender_io.try_send(MemProperties { + addr: gpa, + size, + attributes: attr, + }); + Ok(VcpuEmulation::Handled) + } + } // Documentation specifies that below kvm exits are considered // errors. 
VcpuExit::FailEntry(reason, vcpu) => { error!("Received KVM_EXIT_FAIL_ENTRY signal: reason={reason}, vcpu={vcpu}"); Err(Error::VcpuUnhandledKvmExit) } + // TODO: to remove this + Unsupported(39) => Ok(VcpuEmulation::Handled), VcpuExit::InternalError => { error!("Received KVM_EXIT_INTERNAL_ERROR signal"); Err(Error::VcpuUnhandledKvmExit) @@ -1610,7 +1708,9 @@ mod tests { // Create valid memory region and test that the initialization is successful. let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap(); - assert!(vm.memory_init(&gm, kvm_context.max_memslots(), false).is_ok()); + assert!(vm + .memory_init(&gm, kvm_context.max_memslots(), false) + .is_ok()); // Set the maximum number of memory slots to 1 in KvmContext to check the error // path of memory_init. Create 2 non-overlapping memory slots. @@ -1620,7 +1720,9 @@ mod tests { (GuestAddress(0x1001), 0x2000), ]) .unwrap(); - assert!(vm.memory_init(&gm, kvm_context.max_memslots(), false).is_err()); + assert!(vm + .memory_init(&gm, kvm_context.max_memslots(), false) + .is_err()); } #[cfg(target_arch = "x86_64")]