From e56fecca8a8c7de417ba474e4d8ce7e44b311609 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Wed, 9 Apr 2025 17:59:58 -0700 Subject: [PATCH 01/18] Thinking where to add it --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 8 ++++++++ openhcl/openhcl_boot/src/host_params/dt.rs | 16 +++++++++++++++- openhcl/openhcl_boot/src/host_params/mod.rs | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 openhcl/openhcl_boot/src/host_params/dma_hint.rs diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs new file mode 100644 index 0000000000..e06ac05612 --- /dev/null +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Calculate DMA hint value if not provided by host. + +pub fn vtl2_calculate_dma_hint() -> u64 { + 16384 +} diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 1ae52e24c0..21c72808b2 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -13,6 +13,7 @@ use crate::host_params::MAX_ENTROPY_SIZE; use crate::host_params::MAX_NUMA_NODES; use crate::host_params::MAX_PARTITION_RAM_RANGES; use crate::host_params::MAX_VTL2_USED_RANGES; +use crate::host_params::dma_hint::vtl2_calculate_dma_hint; use crate::single_threaded::OffStackRef; use crate::single_threaded::off_stack; use arrayvec::ArrayVec; @@ -337,6 +338,7 @@ impl PartitionInfo { let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?; let command_line = params.command_line(); + log!("YSP: device_dma_page_count {}", parsed.device_dma_page_count.unwrap_or(0)); // Always write the measured command line. write!( @@ -358,6 +360,7 @@ impl PartitionInfo { match parsed.memory_allocation_mode { MemoryAllocationMode::Host => { + log!("YSP: MemoryAllocationMode::Host"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -369,6 +372,7 @@ impl PartitionInfo { memory_size, mmio_size, } => { + log!("YSP: MemoryAllocationMode::Vtl2"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -457,7 +461,7 @@ impl PartitionInfo { // Decide if we will reserve memory for a VTL2 private pool. Parse this // from the final command line, or the host provided device tree value. - let vtl2_gpa_pool_size = { + let mut vtl2_gpa_pool_size = { let dt_page_count = parsed.device_dma_page_count; let cmdline_page_count = crate::cmdline::parse_boot_command_line(storage.cmdline.as_str()) @@ -465,6 +469,15 @@ impl PartitionInfo { max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)) }; + // If host did not provide the DMA hint value, re-evaluate + // it internally if conditions satisfy. + if vtl2_gpa_pool_size == 0 && parsed.nvme_keepalive { + let dma_hint = vtl2_calculate_dma_hint(); + if dma_hint != 0 { + vtl2_gpa_pool_size = dma_hint; + //*parsed.device_dma_page_count = Some(dma_hint); + } + } if vtl2_gpa_pool_size != 0 { // Reserve the specified number of pages for the pool. Use the used // ranges to figure out which VTL2 memory is free to allocate from. 
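The hint produced by the vtl2_calculate_dma_hint stub above, like the final heuristic later in the series, is a count of 4 KiB pages rather than bytes, so the hard-coded 16384 corresponds to a 64 MiB VTL2 pool reservation. A standalone sketch of the unit conversion (illustrative code, not part of the patch):

    fn main() {
        const PAGE_SIZE_4K: u64 = 4096;
        let hint_pages: u64 = 16384; // value returned by the initial stub
        // 16384 pages * 4 KiB per page = 64 MiB reserved for the VTL2 private pool.
        assert_eq!(hint_pages * PAGE_SIZE_4K, 64 * 1024 * 1024);
    }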
@@ -501,6 +514,7 @@ impl PartitionInfo { storage.vtl2_pool_memory = pool; } + log!("YSP: vtl2_gpa_pool_size {}", vtl2_gpa_pool_size); // If we can trust the host, use the provided alias map if can_trust_host { diff --git a/openhcl/openhcl_boot/src/host_params/mod.rs b/openhcl/openhcl_boot/src/host_params/mod.rs index ea9ac0d422..05b087f011 100644 --- a/openhcl/openhcl_boot/src/host_params/mod.rs +++ b/openhcl/openhcl_boot/src/host_params/mod.rs @@ -15,6 +15,7 @@ use memory_range::MemoryRange; use memory_range::subtract_ranges; use shim_params::IsolationType; +mod dma_hint; mod dt; mod mmio; pub mod shim_params; From 4391f6985e2e63f73f4875c9828e53397558cbe9 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Wed, 9 Apr 2025 18:11:34 -0700 Subject: [PATCH 02/18] Improved syntax --- openhcl/openhcl_boot/src/host_params/dt.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 21c72808b2..578cbd072b 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -461,23 +461,21 @@ impl PartitionInfo { // Decide if we will reserve memory for a VTL2 private pool. Parse this // from the final command line, or the host provided device tree value. - let mut vtl2_gpa_pool_size = { + let vtl2_gpa_pool_size = { let dt_page_count = parsed.device_dma_page_count; let cmdline_page_count = crate::cmdline::parse_boot_command_line(storage.cmdline.as_str()) .enable_vtl2_gpa_pool; - max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)) - }; - // If host did not provide the DMA hint value, re-evaluate - // it internally if conditions satisfy. - if vtl2_gpa_pool_size == 0 && parsed.nvme_keepalive { - let dma_hint = vtl2_calculate_dma_hint(); - if dma_hint != 0 { - vtl2_gpa_pool_size = dma_hint; - //*parsed.device_dma_page_count = Some(dma_hint); + let hostval = max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)); + if hostval == 0 && parsed.nvme_keepalive { + // If host did not provide the DMA hint value, re-evaluate + // it internally if conditions satisfy. + vtl2_calculate_dma_hint() + } else { + hostval } - } + }; if vtl2_gpa_pool_size != 0 { // Reserve the specified number of pages for the pool. Use the used // ranges to figure out which VTL2 memory is free to allocate from. From ac049360bd1caba866f94e9c162a04b1cc02638b Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 10 Apr 2025 21:52:57 +0000 Subject: [PATCH 03/18] More experiments --- openhcl/openhcl_boot/src/dt.rs | 2 ++ openhcl/openhcl_boot/src/host_params/dma_hint.rs | 14 +++++++++++++- openhcl/openhcl_boot/src/host_params/dt.rs | 7 +++++-- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/openhcl/openhcl_boot/src/dt.rs b/openhcl/openhcl_boot/src/dt.rs index 3a85058c9c..15393f0e70 100644 --- a/openhcl/openhcl_boot/src/dt.rs +++ b/openhcl/openhcl_boot/src/dt.rs @@ -9,6 +9,7 @@ use crate::ReservedMemoryType; use crate::host_params::COMMAND_LINE_SIZE; use crate::host_params::PartitionInfo; use crate::host_params::shim_params::IsolationType; +use crate::log; // YSP use crate::sidecar::SidecarConfig; use crate::single_threaded::off_stack; use arrayvec::ArrayString; @@ -211,6 +212,7 @@ pub fn write_dt( let p_enable_method = builder.add_string("enable-method")?; let num_cpus = partition_info.cpus.len(); + log!("YSP: num_cpus2 = {}", num_cpus); let mut root_builder = builder .start_node("")? 
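The dma_hint.rs changes that follow feed two inputs into the heuristic: the VP count from the parsed device tree and the total RAM marked VTL2_PROTECTABLE. A minimal sketch of how the second input reduces to a MiB figure, using plain (start, end) byte ranges in place of the real PartitionInfo/MemoryEntry types:

    fn vtl2_protectable_mb(ranges: &[(u64, u64)]) -> u64 {
        // Sum the lengths of the VTL2_PROTECTABLE ranges, then convert bytes to MiB,
        // mirroring the iterator chain here and the `/ 1048576` added in the next commit.
        let bytes: u64 = ranges.iter().map(|(start, end)| end - start).sum();
        bytes / (1024 * 1024)
    }

    fn main() {
        // 0x6200000 bytes (a value used in the later unit tests) is 98 MiB.
        assert_eq!(vtl2_protectable_mb(&[(0x0, 0x6200000)]), 98);
    }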
diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index e06ac05612..8c74a32ddc 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -3,6 +3,18 @@ //! Calculate DMA hint value if not provided by host. -pub fn vtl2_calculate_dma_hint() -> u64 { +use crate::log; +use igvm_defs::MemoryMapEntryType; +use super::PartitionInfo; + +pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { + log!("YSP: vp_count = {}", vp_count); + let mem_size = storage + .vtl2_ram + .iter() + .filter(|m| m.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE) + .map(|e| e.range.len()) + .sum::(); + log!("YSP: mem_size = {}", mem_size); 16384 } diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 578cbd072b..886f50f307 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -468,10 +468,13 @@ impl PartitionInfo { .enable_vtl2_gpa_pool; let hostval = max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)); - if hostval == 0 && parsed.nvme_keepalive { + if hostval == 0 && + parsed.nvme_keepalive && + params.isolation_type == IsolationType::None && + storage.memory_allocation_mode == MemoryAllocationMode::Host { // If host did not provide the DMA hint value, re-evaluate // it internally if conditions satisfy. - vtl2_calculate_dma_hint() + vtl2_calculate_dma_hint(parsed.cpu_count(), storage) } else { hostval } From b45f919b4079bac5c2f3f724bdbd7848e0980988 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 10 Apr 2025 23:01:20 +0000 Subject: [PATCH 04/18] Added table lookup (incomplete table yet) --- .../openhcl_boot/src/host_params/dma_hint.rs | 75 ++++++++++++++++++- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 8c74a32ddc..410cbf8772 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -7,7 +7,61 @@ use crate::log; use igvm_defs::MemoryMapEntryType; use super::PartitionInfo; +struct DmaLookupStruct { + /// Logical processors for VM. + vp_count: u32, + /// Vtl2AddressRangeSize - Vtl2MmioAddressRangeSize. + vtl2_memory_mb: u32, + /// DMA hint in MiB. + dma_hint_mb: u32, +} + +/// Lookup table for DMA hint calculation. 
+const LookupTable: &'static [DmaLookupStruct] = &[ + DmaLookupStruct { + vp_count: 2, + vtl2_memory_mb: 98, + dma_hint_mb: 4, + }, + DmaLookupStruct { + vp_count: 4, + vtl2_memory_mb: 110, + dma_hint_mb: 6, + }, + DmaLookupStruct { + vp_count: 8, + vtl2_memory_mb: 148, + dma_hint_mb: 10, + }, + DmaLookupStruct { + vp_count: 16, + vtl2_memory_mb: 256, + dma_hint_mb: 18, + }, + DmaLookupStruct { + vp_count: 32, + vtl2_memory_mb: 516, + dma_hint_mb: 36, + }, + DmaLookupStruct { + vp_count: 48, + vtl2_memory_mb: 718, + dma_hint_mb: 52, + }, + DmaLookupStruct { + vp_count: 64, + vtl2_memory_mb: 924, + dma_hint_mb: 68, + }, + DmaLookupStruct { + vp_count: 96, + vtl2_memory_mb: 1340, + dma_hint_mb: 102, + }, +]; + pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { + let mut dma_hint = 0; log!("YSP: vp_count = {}", vp_count); let mem_size = storage .vtl2_ram @@ -15,6 +69,23 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 .filter(|m| m.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE) .map(|e| e.range.len()) .sum::(); - log!("YSP: mem_size = {}", mem_size); - 16384 + // Sanity check for the calculated memory size. + if mem_size > 0 && mem_size < 0xFFFFFFFF00000 { + let mem_size_mb = (mem_size / 1048576) as u32; + log!("YSP: mem_size_mb = {}", mem_size_mb); + + LookupTable + .iter() + .filter(|e| e.vp_count == vp_count as u32) + .for_each(|f| { + if f.vtl2_memory_mb == mem_size_mb { + // Found exact match. + dma_hint = f.dma_hint_mb as u64; + } else { + // Try to extrapolate based on similar values. + } + }); + } + + dma_hint } From 3d744e2f9abf38cf279e2f474f186950354941c8 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 10 Apr 2025 23:32:52 +0000 Subject: [PATCH 05/18] Minor styling --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 410cbf8772..cb4c9df1ef 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -4,7 +4,7 @@ //! Calculate DMA hint value if not provided by host. use crate::log; -use igvm_defs::MemoryMapEntryType; +use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; struct DmaLookupStruct { @@ -17,7 +17,7 @@ struct DmaLookupStruct { } /// Lookup table for DMA hint calculation. -const LookupTable: &'static [DmaLookupStruct] = &[ +const LOOKUP_TABLE: &'static [DmaLookupStruct] = &[ DmaLookupStruct { vp_count: 2, vtl2_memory_mb: 98, @@ -60,8 +60,9 @@ const LookupTable: &'static [DmaLookupStruct] = &[ }, ]; +/// Returns calculated DMA hint value, in 4k pages. pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { - let mut dma_hint = 0; + let mut dma_hint_4k = 0; log!("YSP: vp_count = {}", vp_count); let mem_size = storage .vtl2_ram @@ -74,18 +75,18 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 let mem_size_mb = (mem_size / 1048576) as u32; log!("YSP: mem_size_mb = {}", mem_size_mb); - LookupTable + LOOKUP_TABLE .iter() .filter(|e| e.vp_count == vp_count as u32) .for_each(|f| { if f.vtl2_memory_mb == mem_size_mb { // Found exact match. - dma_hint = f.dma_hint_mb as u64; + dma_hint_4k = f.dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; } else { // Try to extrapolate based on similar values. 
} }); } - dma_hint + dma_hint_4k } From 5670edfe2b1b1dcd39cec780b10b47c64d9f17fc Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Tue, 15 Apr 2025 19:11:17 +0000 Subject: [PATCH 06/18] Add unit test --- .../openhcl_boot/src/host_params/dma_hint.rs | 80 +++++++++++++++++-- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index cb4c9df1ef..2b82707017 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -75,18 +75,86 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 let mem_size_mb = (mem_size / 1048576) as u32; log!("YSP: mem_size_mb = {}", mem_size_mb); - LOOKUP_TABLE - .iter() - .filter(|e| e.vp_count == vp_count as u32) - .for_each(|f| { + let mut min_vtl2_memory_mb = 1000000; + let mut max_vtl2_memory_mb = 0; + // TODO: If we won't allow floats in boot-shim, replace with scaled integers + let mut min_ratio: f32 = 0.1; + let mut max_ratio: f32 = 0.01; + + let mut min_vp_count = 1; + let mut max_vp_count = 4096; + + for f in LOOKUP_TABLE { + if f.vp_count == vp_count as u32 { if f.vtl2_memory_mb == mem_size_mb { // Found exact match. dma_hint_4k = f.dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; + break; } else { - // Try to extrapolate based on similar values. + // Prepare for possible extrapolation. + min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); + min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); } - }); + } else if f.vp_count < vp_count as u32 { + // Find the nearest VP counts if exact match is not in the table. + min_vp_count = min_vp_count.max(f.vp_count); + } else if f.vp_count > vp_count as u32 { + max_vp_count = max_vp_count.min(f.vp_count); + } + } + + // It is possible there were no matching entries in the lookup table. + // (i.e. unexpected VP count). + if max_vtl2_memory_mb == 0 { + LOOKUP_TABLE + .iter() + .filter(|e| e.vp_count == min_vp_count || e.vp_count == max_vp_count) + .for_each(|f| { + // Prepare for possible extrapolation. + min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); + min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + }); + } + + if dma_hint_4k == 0 { + // If we didn't find an exact match for our vp_count, try to extrapolate. 
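            // (Extrapolation idea: each table row yields a hint-to-memory ratio;
            //  the statement below averages the smallest and largest ratios collected
            //  above and applies that average to this VM's VTL2 memory size, then
            //  converts MiB to 4 KiB pages.)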
+ dma_hint_4k = (mem_size_mb as f32 * ((min_ratio + max_ratio) / 2.0)) as u64 * + 1048576 / + PAGE_SIZE_4K; + } } dma_hint_4k } + +#[cfg(test)] +mod test { + use super::*; + use crate::host_params::MemoryEntry; + use crate::MemoryRange; + + #[test] + fn test_vtl2_calculate_dma_hint() { + let mut storage = PartitionInfo::new(); + + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x6200000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(2, &storage), 1024); + + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x6E00000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(4, &storage), 1536); + } +} From 536e7bc46b3351e2d0b506f3fb29f2df01afeebf Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Tue, 15 Apr 2025 19:38:04 +0000 Subject: [PATCH 07/18] More tests --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 2b82707017..55ad0aef77 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -156,5 +156,15 @@ mod test { vnode: 0, }); assert_eq!(vtl2_calculate_dma_hint(4, &storage), 1536); + + // Test VP count higher than max from LOOKUP_TABLE. + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x7000000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + // TODO: Unfinished, maybe the return value is incorrect. + assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2048); } } From 936d26265f698c7427d0915b7d9d4a59ca77c882 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 17 Apr 2025 11:07:52 -0700 Subject: [PATCH 08/18] Replace float point with integers --- Cargo.lock | 1 + openhcl/openhcl_boot/Cargo.toml | 3 + openhcl/openhcl_boot/src/dt.rs | 2 - .../openhcl_boot/src/host_params/dma_hint.rs | 57 +++++++++++++------ openhcl/openhcl_boot/src/host_params/dt.rs | 4 -- 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dda653c599..13cbe2d9d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4786,6 +4786,7 @@ dependencies = [ "sha2", "sidecar_defs", "tdcall", + "test_with_tracing", "underhill_confidentiality", "x86defs", "zerocopy 0.8.24", diff --git a/openhcl/openhcl_boot/Cargo.toml b/openhcl/openhcl_boot/Cargo.toml index f8d8a70e0a..0e9cbe6b14 100644 --- a/openhcl/openhcl_boot/Cargo.toml +++ b/openhcl/openhcl_boot/Cargo.toml @@ -37,3 +37,6 @@ minimal_rt_build.workspace = true [lints] workspace = true + +[dev-dependencies] +test_with_tracing.workspace = true diff --git a/openhcl/openhcl_boot/src/dt.rs b/openhcl/openhcl_boot/src/dt.rs index 15393f0e70..3a85058c9c 100644 --- a/openhcl/openhcl_boot/src/dt.rs +++ b/openhcl/openhcl_boot/src/dt.rs @@ -9,7 +9,6 @@ use crate::ReservedMemoryType; use crate::host_params::COMMAND_LINE_SIZE; use crate::host_params::PartitionInfo; use crate::host_params::shim_params::IsolationType; -use crate::log; // YSP use crate::sidecar::SidecarConfig; use crate::single_threaded::off_stack; use arrayvec::ArrayString; @@ -212,7 +211,6 @@ pub fn write_dt( let p_enable_method = builder.add_string("enable-method")?; let num_cpus = partition_info.cpus.len(); - log!("YSP: num_cpus2 = {}", num_cpus); let mut root_builder = builder .start_node("")? 
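In the dma_hint.rs hunk below, the f32 ratios are replaced with integers scaled by 1000, addressing the earlier TODO about not allowing floats in the boot shim. Note the ratio also flips direction: it becomes vtl2_memory_mb * 1000 / dma_hint_mb, i.e. MiB of VTL2 memory per MiB of hint. A standalone sketch of the same arithmetic using one row of the table (8 VPs, 148 MiB of VTL2 memory, 10 MiB hint):

    fn main() {
        // Ratio scaled by 1000 to stay in integer math.
        let ratio_1000th: u64 = 148 * 1000 / 10; // 14_800
        // Dividing a scaled memory size by the ratio recovers a hint in MiB...
        let hint_mb = 148u64 * 1000 / ratio_1000th;
        assert_eq!(hint_mb, 10);
        // ...which the code then multiplies by 256 (1048576 / 4096) to get 4 KiB pages.
        assert_eq!(hint_mb * 256, 2560);
    }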
diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 55ad0aef77..1f7ccfd46c 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -3,7 +3,6 @@ //! Calculate DMA hint value if not provided by host. -use crate::log; use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; @@ -60,10 +59,14 @@ const LOOKUP_TABLE: &'static [DmaLookupStruct] = &[ }, ]; +/// Round up to next 2MiB. +fn round_up_to_2mb(pages_4k: u64) -> u64 { + (pages_4k + 511) & !(511) +} + /// Returns calculated DMA hint value, in 4k pages. pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { let mut dma_hint_4k = 0; - log!("YSP: vp_count = {}", vp_count); let mem_size = storage .vtl2_ram .iter() @@ -73,16 +76,16 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // Sanity check for the calculated memory size. if mem_size > 0 && mem_size < 0xFFFFFFFF00000 { let mem_size_mb = (mem_size / 1048576) as u32; - log!("YSP: mem_size_mb = {}", mem_size_mb); let mut min_vtl2_memory_mb = 1000000; let mut max_vtl2_memory_mb = 0; - // TODO: If we won't allow floats in boot-shim, replace with scaled integers - let mut min_ratio: f32 = 0.1; - let mut max_ratio: f32 = 0.01; + + // To avoid using floats, scale ratios to 1:1000. + let mut min_ratio_1000th = 100000; + let mut max_ratio_1000th = 1000; let mut min_vp_count = 1; - let mut max_vp_count = 4096; + let mut max_vp_count = vp_count as u32; for f in LOOKUP_TABLE { if f.vp_count == vp_count as u32 { @@ -94,8 +97,8 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // Prepare for possible extrapolation. min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); - max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); } } else if f.vp_count < vp_count as u32 { // Find the nearest VP counts if exact match is not in the table. @@ -112,19 +115,20 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 .iter() .filter(|e| e.vp_count == min_vp_count || e.vp_count == max_vp_count) .for_each(|f| { - // Prepare for possible extrapolation. min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); - max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); }); } - + if dma_hint_4k == 0 { // If we didn't find an exact match for our vp_count, try to extrapolate. - dma_hint_4k = (mem_size_mb as f32 * ((min_ratio + max_ratio) / 2.0)) as u64 * - 1048576 / - PAGE_SIZE_4K; + dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) / + ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); + + // And then round up to 2MiB. 
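        // (For reference: 2 MiB is 512 pages of 4 KiB, so round_up_to_2mb computes
        //  (pages_4k + 511) & !511; e.g. 1 rounds up to 512 and 513 rounds up to 1024.)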
+ dma_hint_4k = round_up_to_2mb(dma_hint_4k); } } @@ -136,6 +140,7 @@ mod test { use super::*; use crate::host_params::MemoryEntry; use crate::MemoryRange; + use test_with_tracing::test; #[test] fn test_vtl2_calculate_dma_hint() { @@ -164,7 +169,23 @@ mod test { mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, vnode: 0, }); - // TODO: Unfinished, maybe the return value is incorrect. - assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2048); + assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2560); + + // Test unusual VP count. + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x6000000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(52, &storage), 2048); + + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x8000000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(52, &storage), 2560); } } diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 886f50f307..5fb0d1870d 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -338,7 +338,6 @@ impl PartitionInfo { let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?; let command_line = params.command_line(); - log!("YSP: device_dma_page_count {}", parsed.device_dma_page_count.unwrap_or(0)); // Always write the measured command line. write!( @@ -360,7 +359,6 @@ impl PartitionInfo { match parsed.memory_allocation_mode { MemoryAllocationMode::Host => { - log!("YSP: MemoryAllocationMode::Host"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -372,7 +370,6 @@ impl PartitionInfo { memory_size, mmio_size, } => { - log!("YSP: MemoryAllocationMode::Vtl2"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -515,7 +512,6 @@ impl PartitionInfo { storage.vtl2_pool_memory = pool; } - log!("YSP: vtl2_gpa_pool_size {}", vtl2_gpa_pool_size); // If we can trust the host, use the provided alias map if can_trust_host { From 3b37398af33e9021e4c2ee497665d814bb02ed9d Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 17 Apr 2025 15:48:21 -0700 Subject: [PATCH 09/18] Make it more compact --- .../openhcl_boot/src/host_params/dma_hint.rs | 126 +++++++----------- 1 file changed, 48 insertions(+), 78 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 1f7ccfd46c..a62a4e0b4f 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -6,57 +6,21 @@ use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; -struct DmaLookupStruct { - /// Logical processors for VM. - vp_count: u32, - /// Vtl2AddressRangeSize - Vtl2MmioAddressRangeSize. - vtl2_memory_mb: u32, - /// DMA hint in MiB. - dma_hint_mb: u32, -} - /// Lookup table for DMA hint calculation. 
-const LOOKUP_TABLE: &'static [DmaLookupStruct] = &[ - DmaLookupStruct { - vp_count: 2, - vtl2_memory_mb: 98, - dma_hint_mb: 4, - }, - DmaLookupStruct { - vp_count: 4, - vtl2_memory_mb: 110, - dma_hint_mb: 6, - }, - DmaLookupStruct { - vp_count: 8, - vtl2_memory_mb: 148, - dma_hint_mb: 10, - }, - DmaLookupStruct { - vp_count: 16, - vtl2_memory_mb: 256, - dma_hint_mb: 18, - }, - DmaLookupStruct { - vp_count: 32, - vtl2_memory_mb: 516, - dma_hint_mb: 36, - }, - DmaLookupStruct { - vp_count: 48, - vtl2_memory_mb: 718, - dma_hint_mb: 52, - }, - DmaLookupStruct { - vp_count: 64, - vtl2_memory_mb: 924, - dma_hint_mb: 68, - }, - DmaLookupStruct { - vp_count: 96, - vtl2_memory_mb: 1340, - dma_hint_mb: 102, - }, +/// Using tuples instead of structs to keep it readable. +/// Let's keep the table sorted by VP count, then by assigned memory. +/// Using u16 to keep the memory req short. +/// Max VTL2 memory known today is 24838 MiB. +/// (vp_count, vtl2_memory_mb, dma_hint_mb) +const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ + (2, 98, 4), + (4, 110, 6), + (8, 148, 10), + (16, 256, 18), + (32, 516, 36), + (48, 718, 52), + (64, 924, 68), + (96, 1340, 102), ]; /// Round up to next 2MiB. @@ -77,34 +41,40 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 if mem_size > 0 && mem_size < 0xFFFFFFFF00000 { let mem_size_mb = (mem_size / 1048576) as u32; - let mut min_vtl2_memory_mb = 1000000; + let mut min_vtl2_memory_mb = 65535; let mut max_vtl2_memory_mb = 0; // To avoid using floats, scale ratios to 1:1000. let mut min_ratio_1000th = 100000; let mut max_ratio_1000th = 1000; - let mut min_vp_count = 1; - let mut max_vp_count = vp_count as u32; - - for f in LOOKUP_TABLE { - if f.vp_count == vp_count as u32 { - if f.vtl2_memory_mb == mem_size_mb { - // Found exact match. - dma_hint_4k = f.dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; - break; - } else { - // Prepare for possible extrapolation. - min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); - max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + let mut min_vp_count: u16 = 1; + let mut max_vp_count = vp_count as u16; + + for (vp_lookup, vtl2_memory_mb, dma_hint_mb) in LOOKUP_TABLE { + match (*vp_lookup).cmp(&(vp_count as u16)) { + core::cmp::Ordering::Less => { + // Find nearest. + min_vp_count = min_vp_count.max(*vp_lookup); + } + core::cmp::Ordering::Equal => { + if *vtl2_memory_mb == mem_size_mb as u16 { + // Found exact match. + dma_hint_4k = *dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; + max_vtl2_memory_mb = *vtl2_memory_mb; + break; + } else { + // Prepare for possible extrapolation. + min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); + min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + } + } + core::cmp::Ordering::Greater => { + // Find nearest. + max_vp_count = max_vp_count.min(*vp_lookup); } - } else if f.vp_count < vp_count as u32 { - // Find the nearest VP counts if exact match is not in the table. 
- min_vp_count = min_vp_count.max(f.vp_count); - } else if f.vp_count > vp_count as u32 { - max_vp_count = max_vp_count.min(f.vp_count); } } @@ -113,17 +83,17 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 if max_vtl2_memory_mb == 0 { LOOKUP_TABLE .iter() - .filter(|e| e.vp_count == min_vp_count || e.vp_count == max_vp_count) - .for_each(|f| { - min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); - max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + .filter(|(vp_lookup, _, _)| *vp_lookup == min_vp_count || *vp_lookup == max_vp_count) + .for_each(|(_, vtl2_memory_mb, dma_hint_mb)| { + min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); + min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); }); } if dma_hint_4k == 0 { - // If we didn't find an exact match for our vp_count, try to extrapolate. + // Didn't find an exact match for vp_count, try to extrapolate. dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) / ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); From 989c2c962ea6eea8af1726619c2a59cd5c514868 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 18 Apr 2025 00:42:44 +0000 Subject: [PATCH 10/18] Expand the table --- .../openhcl_boot/src/host_params/dma_hint.rs | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index a62a4e0b4f..7ffd62f569 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -13,14 +13,44 @@ use super::PartitionInfo; /// Max VTL2 memory known today is 24838 MiB. /// (vp_count, vtl2_memory_mb, dma_hint_mb) const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ + (2, 96, 2), (2, 98, 4), + (2, 100, 4), + (2, 104, 4), + (4, 108, 2), (4, 110, 6), + (4, 112, 6), + (4, 118, 8), + (4, 130, 12), + (8, 140, 4), (8, 148, 10), - (16, 256, 18), + (8, 170, 20), + (8, 176, 20), + (16, 234, 12), + (16, 256, 18), // There is another configuration with '20'. + (16, 268, 38), + (16, 282, 54), + (24, 420, 66), + (32, 404, 22), (32, 516, 36), + (32, 538, 74), // There is another configuration with '52'. + (48, 558, 32), (48, 718, 52), + (48, 730, 52), + (48, 746, 78), + (64, 712, 42), (64, 924, 68), + (64, 938, 68), + (96, 1030, 64), + (96, 1042, 114), // Can be '64'. + (96, 1058, 114), // Can be '106'. (96, 1340, 102), + (96, 1358, 104), + (96, 1382, 120), + (112, 1566, 288), + (128, 1342, 84), + (128, 1360, 84), + // (896, 12912, 516), // Needs to be validated as the vNIC number is unknown. ]; /// Round up to next 2MiB. @@ -139,7 +169,7 @@ mod test { mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, vnode: 0, }); - assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2560); + assert_eq!(vtl2_calculate_dma_hint(112, &storage), 5632); // Test unusual VP count. 
storage.vtl2_ram.clear(); From 3ad70408564a5d6649c02c6480ce733bdbdc06e7 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Wed, 23 Apr 2025 23:48:57 +0000 Subject: [PATCH 11/18] Some progress to propagate fallback flag uphill but not quite done yet --- .../openhcl_boot/src/host_params/dma_hint.rs | 38 +++++---- openhcl/openhcl_boot/src/host_params/dt.rs | 9 ++- openhcl/openhcl_dma_manager/src/lib.rs | 2 +- openhcl/underhill_core/src/dispatch/mod.rs | 11 ++- openhcl/underhill_core/src/emuplat/netvsp.rs | 2 +- openhcl/underhill_core/src/lib.rs | 2 +- openhcl/underhill_core/src/nvme_manager.rs | 78 +++++++++++++++---- openhcl/underhill_core/src/options.rs | 6 +- openhcl/underhill_core/src/worker.rs | 11 ++- .../nvme_driver/fuzz/fuzz_emulated_device.rs | 5 ++ .../disk_nvme/nvme_driver/src/driver.rs | 20 +++-- .../disk_nvme/nvme_driver/src/queue_pair.rs | 38 +++++++-- .../disk_nvme/nvme_driver/src/tests.rs | 5 ++ vm/devices/user_driver/src/lib.rs | 3 + vm/devices/user_driver/src/vfio.rs | 21 ++++- .../user_driver_emulated_mock/src/lib.rs | 5 ++ 16 files changed, 195 insertions(+), 61 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 7ffd62f569..e0925a74e9 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -3,8 +3,8 @@ //! Calculate DMA hint value if not provided by host. -use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; +use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; /// Lookup table for DMA hint calculation. /// Using tuples instead of structs to keep it readable. @@ -97,8 +97,10 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // Prepare for possible extrapolation. min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + min_ratio_1000th = min_ratio_1000th + .min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th + .max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); } } core::cmp::Ordering::Greater => { @@ -112,20 +114,24 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // (i.e. unexpected VP count). 
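        // (When that happens, min_vp_count and max_vp_count computed above hold the
        //  nearest VP counts that do appear in the table, and their rows seed the same
        //  min/max memory and ratio bounds used for the extrapolation below.)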
if max_vtl2_memory_mb == 0 { LOOKUP_TABLE - .iter() - .filter(|(vp_lookup, _, _)| *vp_lookup == min_vp_count || *vp_lookup == max_vp_count) - .for_each(|(_, vtl2_memory_mb, dma_hint_mb)| { - min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); - max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); - }); + .iter() + .filter(|(vp_lookup, _, _)| { + *vp_lookup == min_vp_count || *vp_lookup == max_vp_count + }) + .for_each(|(_, vtl2_memory_mb, dma_hint_mb)| { + min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); + min_ratio_1000th = + min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = + max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + }); } if dma_hint_4k == 0 { // Didn't find an exact match for vp_count, try to extrapolate. - dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) / - ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); + dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) + / ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); // And then round up to 2MiB. dma_hint_4k = round_up_to_2mb(dma_hint_4k); @@ -138,14 +144,14 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 #[cfg(test)] mod test { use super::*; - use crate::host_params::MemoryEntry; use crate::MemoryRange; + use crate::host_params::MemoryEntry; use test_with_tracing::test; #[test] fn test_vtl2_calculate_dma_hint() { let mut storage = PartitionInfo::new(); - + storage.vtl2_ram.clear(); storage.vtl2_ram.push(MemoryEntry { range: MemoryRange::new(0x0..0x6200000), @@ -153,7 +159,7 @@ mod test { vnode: 0, }); assert_eq!(vtl2_calculate_dma_hint(2, &storage), 1024); - + storage.vtl2_ram.clear(); storage.vtl2_ram.push(MemoryEntry { range: MemoryRange::new(0x0..0x6E00000), diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 5fb0d1870d..fbadf47feb 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -465,10 +465,11 @@ impl PartitionInfo { .enable_vtl2_gpa_pool; let hostval = max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)); - if hostval == 0 && - parsed.nvme_keepalive && - params.isolation_type == IsolationType::None && - storage.memory_allocation_mode == MemoryAllocationMode::Host { + if hostval == 0 + && parsed.nvme_keepalive + && params.isolation_type == IsolationType::None + && storage.memory_allocation_mode == MemoryAllocationMode::Host + { // If host did not provide the DMA hint value, re-evaluate // it internally if conditions satisfy. vtl2_calculate_dma_hint(parsed.cpu_count(), storage) diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 42e79d7297..64db4d2b0a 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -122,7 +122,7 @@ pub struct OpenhclDmaManager { } /// The required VTL permissions on DMA allocations. -#[derive(Inspect)] +#[derive(Clone, Inspect)] pub enum LowerVtlPermissionPolicy { /// No specific permission constraints are required. 
Any, diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index bf3c460597..04db01ad34 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -7,6 +7,7 @@ mod pci_shutdown; pub mod vtl2_settings_worker; use self::vtl2_settings_worker::DeviceInterfaces; +use crate::nvme_manager; use crate::ControlRequest; use crate::emuplat::EmuplatServicing; use crate::emuplat::netvsp::RuntimeSavedState; @@ -180,7 +181,7 @@ pub(crate) struct LoadedVm { pub _periodic_telemetry_task: Task<()>, - pub nvme_keep_alive: bool, + pub nvme_keepalive: bool, pub test_configuration: Option, pub dma_manager: OpenhclDmaManager, } @@ -494,7 +495,13 @@ impl LoadedVm { // NOTE: This is set via the corresponding env arg, as this feature is // experimental. - let nvme_keepalive = self.nvme_keep_alive && capabilities_flags.enable_nvme_keepalive(); + let nvme_keepalive_runtime_enabled = if let Some(nvme_manager) = self.nvme_manager.as_ref() { + nvme_manager.is_keepalive_still_enabled().await + } else { + false + }; + let nvme_keepalive = self.nvme_keepalive && capabilities_flags.enable_nvme_keepalive() && nvme_keepalive_runtime_enabled; + tracing::info!("YSP: {nvme_keepalive_runtime_enabled} {nvme_keepalive}"); // Do everything before the log flush under a span. let r = async { diff --git a/openhcl/underhill_core/src/emuplat/netvsp.rs b/openhcl/underhill_core/src/emuplat/netvsp.rs index bf517208fb..9398e0fe21 100644 --- a/openhcl/underhill_core/src/emuplat/netvsp.rs +++ b/openhcl/underhill_core/src/emuplat/netvsp.rs @@ -120,7 +120,7 @@ async fn try_create_mana_device( max_sub_channels: u16, dma_client: Arc, ) -> anyhow::Result> { - let device = VfioDevice::new(driver_source, pci_id, dma_client) + let device = VfioDevice::new(driver_source, pci_id, dma_client, None) .await .context("failed to open device")?; diff --git a/openhcl/underhill_core/src/lib.rs b/openhcl/underhill_core/src/lib.rs index 41038dbd07..5f101eebb9 100644 --- a/openhcl/underhill_core/src/lib.rs +++ b/openhcl/underhill_core/src/lib.rs @@ -318,7 +318,7 @@ async fn launch_workers( no_sidecar_hotplug: opt.no_sidecar_hotplug, gdbstub: opt.gdbstub, hide_isolation: opt.hide_isolation, - nvme_keep_alive: opt.nvme_keep_alive, + nvme_keepalive: opt.nvme_keepalive, test_configuration: opt.test_configuration, disable_uefi_frontpage: opt.disable_uefi_frontpage, }; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 2580996140..6bea0f422f 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -26,8 +26,10 @@ use pal_async::task::Spawn; use pal_async::task::Task; use std::collections::HashMap; use std::collections::hash_map; +use std::sync::Arc; use thiserror::Error; use tracing::Instrument; +use user_driver::DmaClient; use user_driver::vfio::VfioDevice; use vm_resource::AsyncResolveResource; use vm_resource::ResourceId; @@ -126,6 +128,15 @@ impl NvmeManager { &self.client } + /// Save could have been disabled if fallback allocator was used. + pub async fn is_keepalive_still_enabled(&self) -> bool { + let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await { + Ok(s) => s, + Err(_) => false, + }; + self.save_restore_supported && worker_save_restore + } + pub async fn shutdown(self, nvme_keepalive: bool) { // Early return is faster way to skip shutdown. // but we need to thoroughly test the data integrity. 
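Together with the dispatch/mod.rs change above, keepalive now has to clear three independent gates before the servicing path passes nvme_keepalive=true into the shutdown seen here. A condensed sketch of that decision (the function name is illustrative; the three flags mirror the ones used in this series):

    fn nvme_keepalive_effective(env_opt_in: bool, host_allows: bool, runtime_ok: bool) -> bool {
        // env_opt_in:  OPENHCL_NVME_KEEP_ALIVE was set for this VM
        // host_allows: the host advertised enable_nvme_keepalive in capabilities_flags
        // runtime_ok:  no device had to fall back to the non-persistent DMA allocator
        env_opt_in && host_allows && runtime_ok
    }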
@@ -175,6 +186,7 @@ enum Request { span: tracing::Span, nvme_keepalive: bool, }, + KeepAliveStatus(Rpc<(), bool>), } #[derive(Debug, Clone)] @@ -267,6 +279,9 @@ impl NvmeManagerWorker { } break (span, nvme_keepalive); } + Request::KeepAliveStatus(rpc) => { + rpc.complete(self.save_restore_supported); + } } }; @@ -295,24 +310,52 @@ impl NvmeManagerWorker { let driver = match self.devices.entry(pci_id.to_owned()) { hash_map::Entry::Occupied(entry) => entry.into_mut(), hash_map::Entry::Vacant(entry) => { + let device_name = format!("nvme_{}", pci_id); + let allocation_visibility = if self.is_isolated { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }; + let lower_vtl_policy = LowerVtlPermissionPolicy::Any; + + // Persistent allocations use fixed size memory. + // Create a fallback allocator which uses heap. + // When fallback allocator is involved, nvme_keepalive + // will be implicitly disabled. + let fallback_dma_client = if self.save_restore_supported { + let client: Arc = self + .dma_client_spawner + .new_client(DmaClientParameters { + device_name: device_name.clone(), + lower_vtl_policy: lower_vtl_policy.clone(), + allocation_visibility, + persistent_allocations: false, + }) + .map_err(InnerError::DmaClient)?; + Some(client) + } else { + None + }; + let dma_client = self .dma_client_spawner .new_client(DmaClientParameters { - device_name: format!("nvme_{}", pci_id), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: if self.is_isolated { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private - }, + device_name, + lower_vtl_policy, + allocation_visibility, persistent_allocations: self.save_restore_supported, }) .map_err(InnerError::DmaClient)?; - let device = VfioDevice::new(&self.driver_source, entry.key(), dma_client) - .instrument(tracing::info_span!("vfio_device_open", pci_id)) - .await - .map_err(InnerError::Vfio)?; + let device = VfioDevice::new( + &self.driver_source, + entry.key(), + dma_client, + fallback_dma_client, + ) + .instrument(tracing::info_span!("vfio_device_open", pci_id)) + .await + .map_err(InnerError::Vfio)?; let driver = nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device) @@ -380,10 +423,15 @@ impl NvmeManagerWorker { // This code can wait on each VFIO device until it is arrived. // A potential optimization would be to delay VFIO operation // until it is ready, but a redesign of VfioDevice is needed. - let vfio_device = - VfioDevice::restore(&self.driver_source, &disk.pci_id.clone(), true, dma_client) - .instrument(tracing::info_span!("vfio_device_restore", pci_id)) - .await?; + let vfio_device = VfioDevice::restore( + &self.driver_source, + &disk.pci_id.clone(), + true, + dma_client, + None, + ) + .instrument(tracing::info_span!("vfio_device_restore", pci_id)) + .await?; let nvme_driver = nvme_driver::NvmeDriver::restore( &self.driver_source, diff --git a/openhcl/underhill_core/src/options.rs b/openhcl/underhill_core/src/options.rs index 5dd9254f8c..647f1b8442 100644 --- a/openhcl/underhill_core/src/options.rs +++ b/openhcl/underhill_core/src/options.rs @@ -140,7 +140,7 @@ pub struct Options { pub no_sidecar_hotplug: bool, /// (OPENHCL_NVME_KEEP_ALIVE=1) Enable nvme keep alive when servicing. 
- pub nvme_keep_alive: bool, + pub nvme_keepalive: bool, /// (OPENHCL_TEST_CONFIG=\) /// Test configurations are designed to replicate specific behaviors and @@ -237,7 +237,7 @@ impl Options { let no_sidecar_hotplug = parse_legacy_env_bool("OPENHCL_NO_SIDECAR_HOTPLUG"); let gdbstub = parse_legacy_env_bool("OPENHCL_GDBSTUB"); let gdbstub_port = parse_legacy_env_number("OPENHCL_GDBSTUB_PORT")?.map(|x| x as u32); - let nvme_keep_alive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE"); + let nvme_keepalive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE"); let test_configuration = parse_env_string("OPENHCL_TEST_CONFIG").and_then(|x| { x.to_string_lossy() .parse::() @@ -304,7 +304,7 @@ impl Options { hide_isolation, halt_on_guest_halt, no_sidecar_hotplug, - nvme_keep_alive, + nvme_keepalive, test_configuration, disable_uefi_frontpage, }) diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 6c668a387f..3913e8e657 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -280,8 +280,8 @@ pub struct UnderhillEnvCfg { pub gdbstub: bool, /// Hide the isolation mode from the guest. pub hide_isolation: bool, - /// Enable nvme keep alive. - pub nvme_keep_alive: bool, + /// Enable nvme keep-alive. + pub nvme_keepalive: bool, /// test configuration pub test_configuration: Option, @@ -1839,7 +1839,10 @@ async fn new_underhill_vm( // TODO: reevaluate enablement of nvme save restore when private pool // save restore to bootshim is available. let private_pool_available = !runtime_params.private_pool_ranges().is_empty(); - let save_restore_supported = env_cfg.nvme_keep_alive && private_pool_available; + // Two separate flags because: + // - private pool alone can be used for other purposes; + // - host must explicitly indicate that keepalive is supported (compatibility). + let save_restore_supported = env_cfg.nvme_keepalive && private_pool_available; let manager = NvmeManager::new( &driver_source, @@ -3004,7 +3007,7 @@ async fn new_underhill_vm( control_send, _periodic_telemetry_task: periodic_telemetry_task, - nvme_keep_alive: env_cfg.nvme_keep_alive, + nvme_keepalive: env_cfg.nvme_keepalive, test_configuration: env_cfg.test_configuration, dma_manager, }; diff --git a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs index 1463767ebb..0e8e5c2b29 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs @@ -53,6 +53,11 @@ impl Dev self.device.dma_client() } + // Not implemented for Fuzzer yet. + fn fallback_dma_client(&self) -> Option> { + None + } + /// Arbitrarily decide to passthrough or return arbitrary value. fn max_interrupt_count(&self) -> u32 { // Case: Fuzz response diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 558be0a035..2dfc57e508 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -288,6 +288,11 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; + if admin.fallback_used() { + // Unconditionally disable keepalive if fallback allocator was involved. + self.nvme_keepalive = false; + tracing::info!("YSP: nvme_keepalive explicitly disabled"); + } let admin = worker.admin.insert(admin); // Register the admin queue with the controller. 
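The admin.fallback_used() check above is the consumer side of a pattern introduced in queue_pair.rs later in this commit: try the persistent DMA client first, fall back to an ordinary allocation if that fails, and report that the fallback fired so keepalive can be disabled. A generic sketch of that shape (closures and Vec<u8> stand in for the real DmaClient and MemoryBlock types):

    fn alloc_with_fallback(
        primary: &dyn Fn(usize) -> Option<Vec<u8>>,
        fallback: Option<&dyn Fn(usize) -> Option<Vec<u8>>>,
        len: usize,
    ) -> Option<(Vec<u8>, bool)> {
        if let Some(mem) = primary(len) {
            return Some((mem, false));
        }
        // `true` tells the caller this memory is not persistent across servicing,
        // so nvme_keepalive must be turned off for this boot.
        fallback.and_then(|f| f(len)).map(|mem| (mem, true))
    }

In the real driver the boolean travels up through QueuePair::fallback_used() into the driver's nvme_keepalive flag.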
@@ -433,7 +438,7 @@ impl NvmeDriver { // Pre-create the IO queue 1 for CPU 0. The other queues will be created // lazily. Numbering for I/O queues starts with 1 (0 is Admin). - let issuer = worker + let (issuer, fallback_alloc) = worker .create_io_queue(&mut state, 0) .await .context("failed to create io queue 1")?; @@ -807,7 +812,7 @@ impl DriverWorkerTask { return; } - let issuer = match self + let (issuer, fallback_alloc) = match self .create_io_queue(state, cpu) .instrument(info_span!("create_nvme_io_queue", cpu)) .await @@ -828,7 +833,7 @@ impl DriverWorkerTask { error = err.as_ref() as &dyn std::error::Error, "failed to create io queue, falling back" ); - fallback.clone() + (fallback.clone(), false) } }; @@ -842,7 +847,7 @@ impl DriverWorkerTask { &mut self, state: &mut WorkerState, cpu: u32, - ) -> anyhow::Result { + ) -> anyhow::Result<(IoIssuer, bool)> { if self.io.len() >= state.max_io_queues as usize { anyhow::bail!("no more io queues available"); } @@ -868,6 +873,7 @@ impl DriverWorkerTask { self.registers.clone(), ) .with_context(|| format!("failed to create io queue pair {qid}"))?; + let fallback_used = queue.fallback_used(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); @@ -939,10 +945,12 @@ impl DriverWorkerTask { return Err(err); } - Ok(IoIssuer { + Ok((IoIssuer { issuer: io_queue.queue.issuer().clone(), cpu, - }) + }, + fallback_used, + )) } /// Save NVMe driver state for servicing. diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index 95ee213b1a..b52a6dc3b0 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -45,6 +45,13 @@ use zerocopy::FromZeros; /// Value for unused PRP entries, to catch/mitigate buffer size mismatches. const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1); +/// Memory allocator errors. 
+#[derive(Debug, Error)] +pub enum AllocatorError { + #[error("no fallback allocator")] + NoFallbackAllocator, +} + pub(crate) struct QueuePair { task: Task, cancel: Cancel, @@ -53,6 +60,7 @@ pub(crate) struct QueuePair { qid: u16, sq_entries: u16, cq_entries: u16, + fallback_alloc_used: bool, } impl Inspect for QueuePair { @@ -65,6 +73,7 @@ impl Inspect for QueuePair { qid: _, sq_entries: _, cq_entries: _, + fallback_alloc_used: _, } = self; issuer.send.send(Req::Inspect(req.defer())); } @@ -181,18 +190,29 @@ impl QueuePair { interrupt: DeviceInterrupt, registers: Arc>, ) -> anyhow::Result { + assert!(sq_entries <= Self::MAX_SQ_ENTRIES); + assert!(cq_entries <= Self::MAX_CQ_ENTRIES); + + let mut fallback_involved = false; let total_size = QueuePair::SQ_SIZE + QueuePair::CQ_SIZE + QueuePair::PER_QUEUE_PAGES * PAGE_SIZE; let dma_client = device.dma_client(); let mem = dma_client .allocate_dma_buffer(total_size) - .context("failed to allocate memory for queues")?; - - assert!(sq_entries <= Self::MAX_SQ_ENTRIES); - assert!(cq_entries <= Self::MAX_CQ_ENTRIES); + .context("failed to allocate memory for the queues") + .or_else(|_| { + fallback_involved = true; + device.fallback_dma_client().map_or( + Err(AllocatorError::NoFallbackAllocator.into()), + |f| { + f.allocate_dma_buffer(total_size) + .context("fallback allocator also failed") + }, + ) + }); QueuePair::new_or_restore( - spawner, qid, sq_entries, cq_entries, interrupt, registers, mem, None, + spawner, qid, sq_entries, cq_entries, interrupt, registers, mem?, None, fallback_involved, ) } @@ -206,6 +226,7 @@ impl QueuePair { registers: Arc>, mem: MemoryBlock, saved_state: Option<&QueueHandlerSavedState>, + fallback_alloc_used: bool, ) -> anyhow::Result { // MemoryBlock is either allocated or restored prior calling here. let sq_mem_block = mem.subblock(0, QueuePair::SQ_SIZE); @@ -255,6 +276,7 @@ impl QueuePair { qid, sq_entries, cq_entries, + fallback_alloc_used, }) } @@ -275,6 +297,11 @@ impl QueuePair { self.task.await } + /// Indicates that fallback allocator was used for this queue pair. + pub fn fallback_used(&self) -> bool { + self.fallback_alloc_used + } + /// Save queue pair state for servicing. pub async fn save(&self) -> anyhow::Result { // Return error if the queue does not have any memory allocated. @@ -321,6 +348,7 @@ impl QueuePair { registers, mem, Some(handler_data), + false, ) } } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index afaef105d7..16b827c46f 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -366,6 +366,11 @@ impl Dev self.device.dma_client() } + // TODO: We should add one for testing. + fn fallback_dma_client(&self) -> Option> { + None + } + fn max_interrupt_count(&self) -> u32 { self.device.max_interrupt_count() } diff --git a/vm/devices/user_driver/src/lib.rs b/vm/devices/user_driver/src/lib.rs index 2a451bc055..1461c382e8 100644 --- a/vm/devices/user_driver/src/lib.rs +++ b/vm/devices/user_driver/src/lib.rs @@ -33,6 +33,9 @@ pub trait DeviceBacking: 'static + Send + Inspect { /// DMA Client for the device. fn dma_client(&self) -> Arc; + /// Fallback DMA Client for the device. + fn fallback_dma_client(&self) -> Option>; + /// Returns the maximum number of interrupts that can be mapped. 
fn max_interrupt_count(&self) -> u32; diff --git a/vm/devices/user_driver/src/vfio.rs b/vm/devices/user_driver/src/vfio.rs index 0bc6a7670b..137d1db334 100644 --- a/vm/devices/user_driver/src/vfio.rs +++ b/vm/devices/user_driver/src/vfio.rs @@ -57,6 +57,7 @@ pub struct VfioDevice { #[inspect(skip)] config_space: vfio_sys::RegionInfo, dma_client: Arc, + fallback_dma_client: Option>, } #[derive(Inspect)] @@ -74,8 +75,16 @@ impl VfioDevice { driver_source: &VmTaskDriverSource, pci_id: &str, dma_client: Arc, + fallback_dma_client: Option>, ) -> anyhow::Result { - Self::restore(driver_source, pci_id, false, dma_client).await + Self::restore( + driver_source, + pci_id, + false, + dma_client, + fallback_dma_client, + ) + .await } /// Creates a new VFIO-backed device for the PCI device with `pci_id`. @@ -83,8 +92,9 @@ impl VfioDevice { pub async fn restore( driver_source: &VmTaskDriverSource, pci_id: &str, - keepalive: bool, + vf_keepalive: bool, dma_client: Arc, + fallback_dma_client: Option>, ) -> anyhow::Result { let path = Path::new("/sys/bus/pci/devices").join(pci_id); @@ -110,7 +120,7 @@ impl VfioDevice { } container.set_iommu(IommuType::NoIommu)?; - if keepalive { + if vf_keepalive { // Prevent physical hardware interaction when restoring. group.set_keep_alive(pci_id)?; } @@ -131,6 +141,7 @@ impl VfioDevice { driver_source: driver_source.clone(), interrupts: Vec::new(), dma_client, + fallback_dma_client, }; // Ensure bus master enable and memory space enable are set, and that @@ -235,6 +246,10 @@ impl DeviceBacking for VfioDevice { self.dma_client.clone() } + fn fallback_dma_client(&self) -> Option> { + self.fallback_dma_client.clone() + } + fn max_interrupt_count(&self) -> u32 { self.msix_info.count } diff --git a/vm/devices/user_driver_emulated_mock/src/lib.rs b/vm/devices/user_driver_emulated_mock/src/lib.rs index fb68810cd5..0ed650d957 100644 --- a/vm/devices/user_driver_emulated_mock/src/lib.rs +++ b/vm/devices/user_driver_emulated_mock/src/lib.rs @@ -162,6 +162,11 @@ impl Option> { + None + } + fn max_interrupt_count(&self) -> u32 { self.controller.events.len() as u32 } From 0f1227523153a5b99dd1071c6d75a0435fb1c909 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 24 Apr 2025 00:19:44 +0000 Subject: [PATCH 12/18] Simplify some return values --- openhcl/underhill_core/src/nvme_manager.rs | 4 ++- .../disk_nvme/nvme_driver/src/driver.rs | 30 +++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 6bea0f422f..47e09205e1 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -277,7 +277,8 @@ impl NvmeManagerWorker { // Prevent devices from originating controller reset in drop(). dev.update_servicing_flags(do_not_reset); } - break (span, nvme_keepalive); + // Use final combined flag to report back. + break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { rpc.complete(self.save_restore_supported); @@ -289,6 +290,7 @@ impl NvmeManagerWorker { // because the Shutdown request is never sent. // // Tear down all the devices if nvme_keepalive is not set. + // TODO: Since the loop above is returning combined flag, this condition can be simplified. 
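            // (Devices are left alive across servicing only when both are true:
            //  keepalive was requested for this servicing operation and the worker
            //  still reports save/restore as supported.)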
if !nvme_keepalive || !self.save_restore_supported { async { join_all(self.devices.drain().map(|(pci_id, driver)| { diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 2dfc57e508..3e4d344f3e 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -92,6 +92,8 @@ struct WorkerState { qsize: u16, #[inspect(skip)] async_event_task: Task<()>, + /// Tracks if fallback memory allocator was ever used. + fallback_used: bool, } /// An error restoring from saved state. @@ -252,7 +254,7 @@ impl NvmeDriver { io_issuers, rescan_event: Default::default(), namespaces: vec![], - nvme_keepalive: false, + nvme_keepalive: false, // In the beginning always assume it's not supported. }) } @@ -288,8 +290,9 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; - if admin.fallback_used() { - // Unconditionally disable keepalive if fallback allocator was involved. + let fallback_used = admin.fallback_used(); + if fallback_used { + // Unconditionally override keepalive if fallback allocator was involved. self.nvme_keepalive = false; tracing::info!("YSP: nvme_keepalive explicitly disabled"); } @@ -432,13 +435,14 @@ impl NvmeDriver { qsize, async_event_task, max_io_queues, + fallback_used, // May be updated in create_io_queue() below. }; self.admin = Some(admin.issuer().clone()); // Pre-create the IO queue 1 for CPU 0. The other queues will be created // lazily. Numbering for I/O queues starts with 1 (0 is Admin). - let (issuer, fallback_alloc) = worker + let issuer = worker .create_io_queue(&mut state, 0) .await .context("failed to create io queue 1")?; @@ -586,7 +590,7 @@ impl NvmeDriver { io_issuers, rescan_event: Default::default(), namespaces: vec![], - nvme_keepalive: true, + nvme_keepalive: true, // We know it is supported because we're in restore(). }; let task = &mut this.task.as_mut().unwrap(); @@ -640,6 +644,7 @@ impl NvmeDriver { qsize: saved_state.worker_data.qsize, async_event_task, max_io_queues: saved_state.worker_data.max_io_queues, + fallback_used: false, // Because this is restore() which implies that everything was saved. }; this.admin = Some(admin.issuer().clone()); @@ -812,7 +817,7 @@ impl DriverWorkerTask { return; } - let (issuer, fallback_alloc) = match self + let issuer = match self .create_io_queue(state, cpu) .instrument(info_span!("create_nvme_io_queue", cpu)) .await @@ -833,7 +838,7 @@ impl DriverWorkerTask { error = err.as_ref() as &dyn std::error::Error, "failed to create io queue, falling back" ); - (fallback.clone(), false) + fallback.clone() } }; @@ -847,7 +852,7 @@ impl DriverWorkerTask { &mut self, state: &mut WorkerState, cpu: u32, - ) -> anyhow::Result<(IoIssuer, bool)> { + ) -> anyhow::Result { if self.io.len() >= state.max_io_queues as usize { anyhow::bail!("no more io queues available"); } @@ -873,7 +878,8 @@ impl DriverWorkerTask { self.registers.clone(), ) .with_context(|| format!("failed to create io queue pair {qid}"))?; - let fallback_used = queue.fallback_used(); + // Keep tracking if fallback allocator was ever used. + state.fallback_used |= queue.fallback_used(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); @@ -945,12 +951,10 @@ impl DriverWorkerTask { return Err(err); } - Ok((IoIssuer { + Ok(IoIssuer { issuer: io_queue.queue.issuer().clone(), cpu, - }, - fallback_used, - )) + }) } /// Save NVMe driver state for servicing. 
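The accumulation added above is deliberately one-way: once any IO queue pair reports a fallback allocation, state.fallback_used stays set for the life of the driver, and the next commit adds a query so the manager can read it. A tiny sketch of that sticky-OR behavior (standalone, illustrative values):

    fn main() {
        // e.g. the third IO queue could not get persistent memory and fell back
        let per_queue_fallback = [false, false, true, false];
        let mut fallback_ever_used = false;
        for used in per_queue_fallback {
            fallback_ever_used |= used; // mirrors `state.fallback_used |= queue.fallback_used()`
        }
        assert!(fallback_ever_used);
    }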
From 0d796baf51884d1a0ca4ab19b455d60c4c28e311 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 24 Apr 2025 01:17:34 +0000 Subject: [PATCH 13/18] Closer to final code --- openhcl/underhill_core/src/dispatch/mod.rs | 1 - openhcl/underhill_core/src/nvme_manager.rs | 10 ++++++++++ .../disk_nvme/nvme_driver/src/driver.rs | 20 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 04db01ad34..7cf443ff32 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -7,7 +7,6 @@ mod pci_shutdown; pub mod vtl2_settings_worker; use self::vtl2_settings_worker::DeviceInterfaces; -use crate::nvme_manager; use crate::ControlRequest; use crate::emuplat::EmuplatServicing; use crate::emuplat::netvsp::RuntimeSavedState; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 47e09205e1..6f61f8bc4d 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -281,6 +281,16 @@ impl NvmeManagerWorker { break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { + let mut fallback_used = false; + for (_s, dev) in self.devices.iter_mut() { + // Prevent devices from originating controller reset in drop(). + fallback_used |= dev.query_fallback_used().await; + } + if fallback_used { + // If any of the attached devices ever used fallback allocator, + // update internal tracking as well, and return the result. + self.save_restore_supported = false; + } rpc.complete(self.save_restore_supported); } } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 3e4d344f3e..023631657f 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -163,6 +163,8 @@ enum NvmeWorkerRequest { CreateIssuer(Rpc), /// Save worker state. Save(Rpc<(), anyhow::Result>), + /// Query worker state. + QueryAllocatorFallback(Rpc<(), bool>), } impl NvmeDriver { @@ -703,6 +705,21 @@ impl NvmeDriver { pub fn update_servicing_flags(&mut self, nvme_keepalive: bool) { self.nvme_keepalive = nvme_keepalive; } + + /// Queries worker task if memory allocator ever fell back. 
+ pub async fn query_fallback_used(&self) -> bool { + let fb = self + .io_issuers + .send + .call(NvmeWorkerRequest::QueryAllocatorFallback, ()) + .await; + match fb { + Ok(fallback_used) => fallback_used, + Err(_) => { + false + } + } + } } async fn handle_asynchronous_events( @@ -802,6 +819,9 @@ impl AsyncRun for DriverWorkerTask { Some(NvmeWorkerRequest::Save(rpc)) => { rpc.handle(async |_| self.save(state).await).await } + Some(NvmeWorkerRequest::QueryAllocatorFallback(rpc)) => { + rpc.complete(state.fallback_used) + } None => break, } } From b5a2ba1ddaf4f7e203de5fb35d84eb74740a1e3d Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 24 Apr 2025 18:36:48 +0000 Subject: [PATCH 14/18] Validated fallback allocator for NVMe, more tracing --- openhcl/underhill_core/src/dispatch/mod.rs | 11 ++--- openhcl/underhill_core/src/nvme_manager.rs | 40 ++++++++++++++----- .../disk_nvme/nvme_driver/src/driver.rs | 30 ++++++-------- .../disk_nvme/nvme_driver/src/queue_pair.rs | 32 +++++++++------ 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 7cf443ff32..6c49dae1a2 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -494,13 +494,14 @@ impl LoadedVm { // NOTE: This is set via the corresponding env arg, as this feature is // experimental. - let nvme_keepalive_runtime_enabled = if let Some(nvme_manager) = self.nvme_manager.as_ref() { - nvme_manager.is_keepalive_still_enabled().await + let nvme_keepalive_runtime = if let Some(nvme_manager) = self.nvme_manager.as_ref() { + nvme_manager.query_keepalive_runtime_status().await } else { false }; - let nvme_keepalive = self.nvme_keepalive && capabilities_flags.enable_nvme_keepalive() && nvme_keepalive_runtime_enabled; - tracing::info!("YSP: {nvme_keepalive_runtime_enabled} {nvme_keepalive}"); + let nvme_keepalive = self.nvme_keepalive + && capabilities_flags.enable_nvme_keepalive() + && nvme_keepalive_runtime; // Do everything before the log flush under a span. let r = async { @@ -533,7 +534,7 @@ impl LoadedVm { if let Some(nvme_manager) = self.nvme_manager.take() { nvme_manager .shutdown(nvme_keepalive) - .instrument(tracing::info_span!("shutdown_nvme_vfio", %correlation_id, %nvme_keepalive)) + .instrument(tracing::info_span!("shutdown_nvme_vfio", %correlation_id, %nvme_keepalive, %nvme_keepalive_runtime)) .await; } }; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 6f61f8bc4d..4144b66a7c 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -129,9 +129,18 @@ impl NvmeManager { } /// Save could have been disabled if fallback allocator was used. - pub async fn is_keepalive_still_enabled(&self) -> bool { - let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await { - Ok(s) => s, + pub async fn query_keepalive_runtime_status(&self) -> bool { + let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await + { + Ok(s) => { + if s.fallback_alloc > 0 { + tracing::warn!( + mem_size = s.fallback_alloc, + "fallback mem allocator was used" + ); + } + s.nvme_keepalive + } Err(_) => false, }; self.save_restore_supported && worker_save_restore @@ -177,6 +186,13 @@ impl NvmeManager { } } +pub struct NvmeKeepaliveRuntimeStatus { + /// Indicates if keepalive still enabled. 
+ pub nvme_keepalive: bool, + /// How much memory (bytes) was allocated by fallback allocator. + pub fallback_alloc: u64, +} + enum Request { Inspect(inspect::Deferred), ForceLoadDriver(inspect::DeferredUpdate), @@ -186,7 +202,7 @@ enum Request { span: tracing::Span, nvme_keepalive: bool, }, - KeepAliveStatus(Rpc<(), bool>), + KeepAliveStatus(Rpc<(), NvmeKeepaliveRuntimeStatus>), } #[derive(Debug, Clone)] @@ -281,17 +297,19 @@ impl NvmeManagerWorker { break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { - let mut fallback_used = false; + let mut fallback_alloc = 0u64; for (_s, dev) in self.devices.iter_mut() { - // Prevent devices from originating controller reset in drop(). - fallback_used |= dev.query_fallback_used().await; + fallback_alloc += dev.query_fallback_alloc().await; } - if fallback_used { + if fallback_alloc > 0 { // If any of the attached devices ever used fallback allocator, - // update internal tracking as well, and return the result. + // update internal tracking and return the result. self.save_restore_supported = false; } - rpc.complete(self.save_restore_supported); + rpc.complete(NvmeKeepaliveRuntimeStatus { + nvme_keepalive: self.save_restore_supported, + fallback_alloc, + }); } } }; @@ -334,7 +352,7 @@ impl NvmeManagerWorker { // Create a fallback allocator which uses heap. // When fallback allocator is involved, nvme_keepalive // will be implicitly disabled. - let fallback_dma_client = if self.save_restore_supported { + let fallback_dma_client = if self.save_restore_supported && !self.is_isolated { let client: Arc = self .dma_client_spawner .new_client(DmaClientParameters { diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 023631657f..de94ccc423 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -92,8 +92,8 @@ struct WorkerState { qsize: u16, #[inspect(skip)] async_event_task: Task<()>, - /// Tracks if fallback memory allocator was ever used. - fallback_used: bool, + /// Tracks how much memory was allocated using fallback allocator. + fallback_alloc: u64, } /// An error restoring from saved state. @@ -163,8 +163,8 @@ enum NvmeWorkerRequest { CreateIssuer(Rpc), /// Save worker state. Save(Rpc<(), anyhow::Result>), - /// Query worker state. - QueryAllocatorFallback(Rpc<(), bool>), + /// Query how much memory was allocated with fallback allocator. + QueryAllocatorFallback(Rpc<(), u64>), } impl NvmeDriver { @@ -292,11 +292,10 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; - let fallback_used = admin.fallback_used(); - if fallback_used { + let fallback_alloc = admin.fallback_alloc(); + if fallback_alloc > 0 { // Unconditionally override keepalive if fallback allocator was involved. self.nvme_keepalive = false; - tracing::info!("YSP: nvme_keepalive explicitly disabled"); } let admin = worker.admin.insert(admin); @@ -437,7 +436,7 @@ impl NvmeDriver { qsize, async_event_task, max_io_queues, - fallback_used, // May be updated in create_io_queue() below. + fallback_alloc, // May be updated in create_io_queue() below. }; self.admin = Some(admin.issuer().clone()); @@ -646,7 +645,7 @@ impl NvmeDriver { qsize: saved_state.worker_data.qsize, async_event_task, max_io_queues: saved_state.worker_data.max_io_queues, - fallback_used: false, // Because this is restore() which implies that everything was saved. + fallback_alloc: 0, // This is restore() which implies that everything was saved. 
}; this.admin = Some(admin.issuer().clone()); @@ -707,18 +706,13 @@ impl NvmeDriver { } /// Queries worker task if memory allocator ever fell back. - pub async fn query_fallback_used(&self) -> bool { + pub async fn query_fallback_alloc(&self) -> u64 { let fb = self .io_issuers .send .call(NvmeWorkerRequest::QueryAllocatorFallback, ()) .await; - match fb { - Ok(fallback_used) => fallback_used, - Err(_) => { - false - } - } + fb.unwrap_or_default() } } @@ -820,7 +814,7 @@ impl AsyncRun for DriverWorkerTask { rpc.handle(async |_| self.save(state).await).await } Some(NvmeWorkerRequest::QueryAllocatorFallback(rpc)) => { - rpc.complete(state.fallback_used) + rpc.complete(state.fallback_alloc) } None => break, } @@ -899,7 +893,7 @@ impl DriverWorkerTask { ) .with_context(|| format!("failed to create io queue pair {qid}"))?; // Keep tracking if fallback allocator was ever used. - state.fallback_used |= queue.fallback_used(); + state.fallback_alloc += queue.fallback_alloc(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index b52a6dc3b0..4f283747ef 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -48,7 +48,7 @@ const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1); /// Memory allocator errors. #[derive(Debug, Error)] pub enum AllocatorError { - #[error("no fallback allocator")] + #[error("no fallback mem allocator")] NoFallbackAllocator, } @@ -60,7 +60,7 @@ pub(crate) struct QueuePair { qid: u16, sq_entries: u16, cq_entries: u16, - fallback_alloc_used: bool, + fallback_alloc: u64, } impl Inspect for QueuePair { @@ -73,7 +73,7 @@ impl Inspect for QueuePair { qid: _, sq_entries: _, cq_entries: _, - fallback_alloc_used: _, + fallback_alloc: _, } = self; issuer.send.send(Req::Inspect(req.defer())); } @@ -193,7 +193,7 @@ impl QueuePair { assert!(sq_entries <= Self::MAX_SQ_ENTRIES); assert!(cq_entries <= Self::MAX_CQ_ENTRIES); - let mut fallback_involved = false; + let mut fallback_alloc = 0u64; let total_size = QueuePair::SQ_SIZE + QueuePair::CQ_SIZE + QueuePair::PER_QUEUE_PAGES * PAGE_SIZE; let dma_client = device.dma_client(); @@ -201,18 +201,26 @@ impl QueuePair { .allocate_dma_buffer(total_size) .context("failed to allocate memory for the queues") .or_else(|_| { - fallback_involved = true; + fallback_alloc += total_size as u64; device.fallback_dma_client().map_or( Err(AllocatorError::NoFallbackAllocator.into()), |f| { f.allocate_dma_buffer(total_size) - .context("fallback allocator also failed") + .context("fallback mem allocator also failed") }, ) }); QueuePair::new_or_restore( - spawner, qid, sq_entries, cq_entries, interrupt, registers, mem?, None, fallback_involved, + spawner, + qid, + sq_entries, + cq_entries, + interrupt, + registers, + mem?, + None, + fallback_alloc, ) } @@ -226,7 +234,7 @@ impl QueuePair { registers: Arc>, mem: MemoryBlock, saved_state: Option<&QueueHandlerSavedState>, - fallback_alloc_used: bool, + fallback_alloc: u64, ) -> anyhow::Result { // MemoryBlock is either allocated or restored prior calling here. let sq_mem_block = mem.subblock(0, QueuePair::SQ_SIZE); @@ -276,7 +284,7 @@ impl QueuePair { qid, sq_entries, cq_entries, - fallback_alloc_used, + fallback_alloc, }) } @@ -298,8 +306,8 @@ impl QueuePair { } /// Indicates that fallback allocator was used for this queue pair. 
- pub fn fallback_used(&self) -> bool { - self.fallback_alloc_used + pub fn fallback_alloc(&self) -> u64 { + self.fallback_alloc } /// Save queue pair state for servicing. @@ -348,7 +356,7 @@ impl QueuePair { registers, mem, Some(handler_data), - false, + 0u64, ) } } From e4e7ffc519d5807e12d9b7138d6c60a3cfbce828 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 25 Apr 2025 02:58:35 +0000 Subject: [PATCH 15/18] Move fallback logic to dma clients --- Cargo.lock | 2 + .../lower_vtl_permissions_guard/src/lib.rs | 15 ++ openhcl/openhcl_dma_manager/Cargo.toml | 2 + openhcl/openhcl_dma_manager/src/lib.rs | 95 +++++++++++- openhcl/underhill_core/src/dispatch/mod.rs | 7 + openhcl/underhill_core/src/emuplat/netvsp.rs | 2 +- openhcl/underhill_core/src/nvme_manager.rs | 136 ++++++++++-------- openhcl/underhill_core/src/worker.rs | 87 ++++++----- .../nvme_driver/fuzz/fuzz_emulated_device.rs | 5 - .../disk_nvme/nvme_driver/src/driver.rs | 30 ++-- .../disk_nvme/nvme_driver/src/queue_pair.rs | 40 +----- .../disk_nvme/nvme_driver/src/tests.rs | 5 - vm/devices/user_driver/src/lib.rs | 20 ++- vm/devices/user_driver/src/lockmem.rs | 15 ++ vm/devices/user_driver/src/vfio.rs | 17 +-- .../user_driver_emulated_mock/src/lib.rs | 5 - vm/page_pool_alloc/src/lib.rs | 20 ++- 17 files changed, 313 insertions(+), 190 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 13cbe2d9d2..ab0f6572ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4805,6 +4805,8 @@ dependencies = [ "memory_range", "mesh", "page_pool_alloc", + "parking_lot", + "thiserror 2.0.12", "user_driver", "virt", "vmcore", diff --git a/openhcl/lower_vtl_permissions_guard/src/lib.rs b/openhcl/lower_vtl_permissions_guard/src/lib.rs index a8678fbc71..1b2c08f048 100644 --- a/openhcl/lower_vtl_permissions_guard/src/lib.rs +++ b/openhcl/lower_vtl_permissions_guard/src/lib.rs @@ -108,4 +108,19 @@ impl DmaClient for LowerVtlMemorySpawner { fn attach_pending_buffers(&self) -> Result> { anyhow::bail!("restore is not supported for LowerVtlMemorySpawner") } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + false + } + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64 { + 0 + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + 0 + } } diff --git a/openhcl/openhcl_dma_manager/Cargo.toml b/openhcl/openhcl_dma_manager/Cargo.toml index 97b9821055..6e61581bf5 100644 --- a/openhcl/openhcl_dma_manager/Cargo.toml +++ b/openhcl/openhcl_dma_manager/Cargo.toml @@ -16,6 +16,8 @@ inspect.workspace = true memory_range.workspace = true mesh.workspace = true page_pool_alloc.workspace = true +parking_lot.workspace = true +thiserror.workspace = true user_driver.workspace = true virt.workspace = true vmcore.workspace = true diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 64db4d2b0a..819817edc6 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -16,10 +16,21 @@ use memory_range::MemoryRange; use page_pool_alloc::PagePool; use page_pool_alloc::PagePoolAllocator; use page_pool_alloc::PagePoolAllocatorSpawner; +use parking_lot::Mutex; use std::sync::Arc; +use thiserror::Error; use user_driver::DmaClient; +use user_driver::DmaClientAllocStats; use user_driver::lockmem::LockedMemorySpawner; +/// DMA manager errors. +#[derive(Debug, Error)] +pub enum DmaManagerError { + /// No memory. 
+ #[error("no memory")] + NoMemory, +} + /// Save restore support for [`OpenhclDmaManager`]. pub mod save_restore { use super::OpenhclDmaManager; @@ -189,7 +200,11 @@ impl virt::VtlMemoryProtection for DmaManagerLowerVtl { } impl DmaManagerInner { - fn new_dma_client(&self, params: DmaClientParameters) -> anyhow::Result> { + fn new_dma_client( + &self, + params: DmaClientParameters, + fallback: Option>, + ) -> anyhow::Result> { // Allocate the inner client that actually performs the allocations. let backing = { let DmaClientParameters { @@ -297,7 +312,15 @@ impl DmaManagerInner { } }; - Ok(Arc::new(OpenhclDmaClient { backing, params })) + Ok(Arc::new(OpenhclDmaClient { + backing, + params, + fallback, + inner_stats: Mutex::new(DmaClientAllocStats { + total_alloc: 0, + fallback_alloc: 0, + }), + })) } } @@ -346,8 +369,17 @@ impl OpenhclDmaManager { /// Creates a new DMA client with the given device name and lower VTL /// policy. - pub fn new_client(&self, params: DmaClientParameters) -> anyhow::Result> { - self.inner.new_dma_client(params) + pub fn new_client( + &self, + params: DmaClientParameters, + fallback_params: Option, + ) -> anyhow::Result> { + let fb = if let Some(fb1) = fallback_params { + self.inner.new_dma_client(fb1, None).ok() + } else { + None + }; + self.inner.new_dma_client(params, fb) } /// Returns a [`DmaClientSpawner`] for creating DMA clients. @@ -385,8 +417,17 @@ pub struct DmaClientSpawner { impl DmaClientSpawner { /// Creates a new DMA client with the given parameters. - pub fn new_client(&self, params: DmaClientParameters) -> anyhow::Result> { - self.inner.new_dma_client(params) + pub fn new_client( + &self, + params: DmaClientParameters, + fallback_params: Option, + ) -> anyhow::Result> { + let fb = if let Some(fb1) = fallback_params { + self.inner.new_dma_client(fb1, None).ok() + } else { + None + }; + self.inner.new_dma_client(params, fb) } } @@ -429,6 +470,16 @@ impl DmaClientBacking { DmaClientBacking::LockedMemoryLowerVtl(spawner) => spawner.attach_pending_buffers(), } } + + fn is_persistent(&self) -> bool { + match self { + DmaClientBacking::SharedPool(_allocator) => false, + DmaClientBacking::PrivatePool(_allocator) => true, + DmaClientBacking::LockedMemory(_spawner) => false, + DmaClientBacking::PrivatePoolLowerVtl(_spawner) => false, + DmaClientBacking::LockedMemoryLowerVtl(_spawner) => false, + } + } } /// An OpenHCL dma client. This client implements inspect to allow seeing what @@ -437,6 +488,10 @@ impl DmaClientBacking { pub struct OpenhclDmaClient { backing: DmaClientBacking, params: DmaClientParameters, + #[inspect(skip)] // TODO: Skip for now + /// Allocation statistics per client. + inner_stats: Mutex, + fallback: Option>, } impl DmaClient for OpenhclDmaClient { @@ -444,10 +499,36 @@ impl DmaClient for OpenhclDmaClient { &self, total_size: usize, ) -> anyhow::Result { - self.backing.allocate_dma_buffer(total_size) + let mut stats = self.inner_stats.lock(); + stats.total_alloc += total_size as u64; + let mem_block = self.backing.allocate_dma_buffer(total_size).or_else(|_| { + stats.fallback_alloc += total_size as u64; + self.fallback + .as_ref() + .map_or(Err(DmaManagerError::NoMemory.into()), |f| { + f.allocate_dma_buffer(total_size) + }) + }); + + mem_block } fn attach_pending_buffers(&self) -> anyhow::Result> { self.backing.attach_pending_buffers() } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + self.backing.is_persistent() + } + + /// How much memory was allocated during session. 
+ fn alloc_size(&self) -> u64 { + self.inner_stats.lock().total_alloc + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + self.inner_stats.lock().fallback_alloc + } } diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 6c49dae1a2..152637070b 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -499,6 +499,13 @@ impl LoadedVm { } else { false }; + // Three sources to determine if keepalive can be enabled: + // 1. Host indicates that it is compatible with keepalive + // by setting device tree property when VM starts. + // 2. During servicing the capabilities_flags is also set + // so OpenHCL knows that it wasn't migrated to an older host. + // 3. If we ran out of dedicated DMA memory and used non-persistent + // fallback allocator, disable keepalive altogether. let nvme_keepalive = self.nvme_keepalive && capabilities_flags.enable_nvme_keepalive() && nvme_keepalive_runtime; diff --git a/openhcl/underhill_core/src/emuplat/netvsp.rs b/openhcl/underhill_core/src/emuplat/netvsp.rs index 9398e0fe21..bf517208fb 100644 --- a/openhcl/underhill_core/src/emuplat/netvsp.rs +++ b/openhcl/underhill_core/src/emuplat/netvsp.rs @@ -120,7 +120,7 @@ async fn try_create_mana_device( max_sub_channels: u16, dma_client: Arc, ) -> anyhow::Result> { - let device = VfioDevice::new(driver_source, pci_id, dma_client, None) + let device = VfioDevice::new(driver_source, pci_id, dma_client) .await .context("failed to open device")?; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 4144b66a7c..de8037fef4 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -30,6 +30,7 @@ use std::sync::Arc; use thiserror::Error; use tracing::Instrument; use user_driver::DmaClient; +use user_driver::DmaClientAllocStats; use user_driver::vfio::VfioDevice; use vm_resource::AsyncResolveResource; use vm_resource::ResourceId; @@ -133,9 +134,9 @@ impl NvmeManager { let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await { Ok(s) => { - if s.fallback_alloc > 0 { + if s.stats.fallback_alloc > 0 { tracing::warn!( - mem_size = s.fallback_alloc, + mem_size = s.stats.fallback_alloc, "fallback mem allocator was used" ); } @@ -187,10 +188,10 @@ impl NvmeManager { } pub struct NvmeKeepaliveRuntimeStatus { - /// Indicates if keepalive still enabled. + /// Indicates if keepalive is still enabled. pub nvme_keepalive: bool, - /// How much memory (bytes) was allocated by fallback allocator. - pub fallback_alloc: u64, + /// Retrieve statistics from connected DMA client. + pub stats: DmaClientAllocStats, } enum Request { @@ -297,18 +298,23 @@ impl NvmeManagerWorker { break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { - let mut fallback_alloc = 0u64; + let mut stats = DmaClientAllocStats { + total_alloc: 0, + fallback_alloc: 0, + }; for (_s, dev) in self.devices.iter_mut() { - fallback_alloc += dev.query_fallback_alloc().await; + let dev_stats = dev.get_alloc_stats().await; + stats.total_alloc += dev_stats.total_alloc; + stats.fallback_alloc += dev_stats.fallback_alloc; } - if fallback_alloc > 0 { + if stats.fallback_alloc > 0 { // If any of the attached devices ever used fallback allocator, // update internal tracking and return the result. 
self.save_restore_supported = false; } rpc.complete(NvmeKeepaliveRuntimeStatus { nvme_keepalive: self.save_restore_supported, - fallback_alloc, + stats, }); } } @@ -341,51 +347,45 @@ impl NvmeManagerWorker { hash_map::Entry::Occupied(entry) => entry.into_mut(), hash_map::Entry::Vacant(entry) => { let device_name = format!("nvme_{}", pci_id); + let lower_vtl_policy = LowerVtlPermissionPolicy::Any; let allocation_visibility = if self.is_isolated { AllocationVisibility::Shared } else { AllocationVisibility::Private }; - let lower_vtl_policy = LowerVtlPermissionPolicy::Any; + + // Main client parameters. + let main_params = DmaClientParameters { + device_name: device_name.clone(), + lower_vtl_policy: lower_vtl_policy.clone(), + allocation_visibility, + persistent_allocations: self.save_restore_supported, + }; // Persistent allocations use fixed size memory. // Create a fallback allocator which uses heap. // When fallback allocator is involved, nvme_keepalive // will be implicitly disabled. - let fallback_dma_client = if self.save_restore_supported && !self.is_isolated { - let client: Arc = self - .dma_client_spawner - .new_client(DmaClientParameters { - device_name: device_name.clone(), - lower_vtl_policy: lower_vtl_policy.clone(), - allocation_visibility, - persistent_allocations: false, - }) - .map_err(InnerError::DmaClient)?; - Some(client) + let fallback_params = if self.save_restore_supported && !self.is_isolated { + Some(DmaClientParameters { + device_name, + lower_vtl_policy, + allocation_visibility, + persistent_allocations: false, + }) } else { None }; let dma_client = self .dma_client_spawner - .new_client(DmaClientParameters { - device_name, - lower_vtl_policy, - allocation_visibility, - persistent_allocations: self.save_restore_supported, - }) + .new_client(main_params, fallback_params) .map_err(InnerError::DmaClient)?; - let device = VfioDevice::new( - &self.driver_source, - entry.key(), - dma_client, - fallback_dma_client, - ) - .instrument(tracing::info_span!("vfio_device_open", pci_id)) - .await - .map_err(InnerError::Vfio)?; + let device = VfioDevice::new(&self.driver_source, entry.key(), dma_client) + .instrument(tracing::info_span!("vfio_device_open", pci_id)) + .await + .map_err(InnerError::Vfio)?; let driver = nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device) @@ -414,6 +414,45 @@ impl NvmeManagerWorker { .map_err(|source| InnerError::Namespace { nsid, source }) } + /// Copy of the code from get_driver. + fn get_dma_client(&self, pci_id: String) -> Result, InnerError> { + let device_name = format!("nvme_{}", pci_id); + let lower_vtl_policy = LowerVtlPermissionPolicy::Any; + let allocation_visibility = if self.is_isolated { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }; + + // Main client parameters. + let main_params = DmaClientParameters { + device_name: device_name.clone(), + lower_vtl_policy: lower_vtl_policy.clone(), + allocation_visibility, + persistent_allocations: self.save_restore_supported, + }; + + // Persistent allocations use fixed size memory. + // Create a fallback allocator which uses heap. + // When fallback allocator is involved, nvme_keepalive + // will be implicitly disabled. 
+ let fallback_params = if self.save_restore_supported && !self.is_isolated { + Some(DmaClientParameters { + device_name, + lower_vtl_policy, + allocation_visibility, + persistent_allocations: false, + }) + } else { + None + }; + + Ok(self + .dma_client_spawner + .new_client(main_params, fallback_params) + .map_err(InnerError::DmaClient)?) + } + /// Saves NVMe device's states into buffer during servicing. pub async fn save(&mut self) -> anyhow::Result { let mut nvme_disks: Vec = Vec::new(); @@ -438,30 +477,15 @@ impl NvmeManagerWorker { self.devices = HashMap::new(); for disk in &saved_state.nvme_disks { let pci_id = disk.pci_id.clone(); - - let dma_client = self.dma_client_spawner.new_client(DmaClientParameters { - device_name: format!("nvme_{}", pci_id), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: if self.is_isolated { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private - }, - persistent_allocations: true, - })?; + let dma_client = self.get_dma_client(pci_id.clone())?; // This code can wait on each VFIO device until it is arrived. // A potential optimization would be to delay VFIO operation // until it is ready, but a redesign of VfioDevice is needed. - let vfio_device = VfioDevice::restore( - &self.driver_source, - &disk.pci_id.clone(), - true, - dma_client, - None, - ) - .instrument(tracing::info_span!("vfio_device_restore", pci_id)) - .await?; + let vfio_device = + VfioDevice::restore(&self.driver_source, &disk.pci_id.clone(), true, dma_client) + .instrument(tracing::info_span!("vfio_device_restore", pci_id)) + .await?; let nvme_driver = nvme_driver::NvmeDriver::restore( &self.driver_source, diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 3913e8e657..2fb8e487af 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -745,16 +745,19 @@ impl UhVmNetworkSettings { .unwrap_or(MAX_SUBCHANNELS_PER_VNIC) .min(vps_count as u16); - let dma_client = dma_client_spawner.new_client(DmaClientParameters { - device_name: format!("nic_{}", nic_config.pci_id), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: if is_isolated { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private + let dma_client = dma_client_spawner.new_client( + DmaClientParameters { + device_name: format!("nic_{}", nic_config.pci_id), + lower_vtl_policy: LowerVtlPermissionPolicy::Any, + allocation_visibility: if is_isolated { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }, + persistent_allocations: false, }, - persistent_allocations: false, - })?; + None, + )?; let (vf_manager, endpoints, save_state) = HclNetworkVFManager::new( nic_config.instance_id, @@ -1556,16 +1559,19 @@ async fn new_underhill_vm( if !matches!(isolation, virt::IsolationType::Vbs) { get_client.set_gpa_allocator( dma_manager - .new_client(DmaClientParameters { - device_name: "get".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, - allocation_visibility: if isolation.is_isolated() { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private + .new_client( + DmaClientParameters { + device_name: "get".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, + allocation_visibility: if isolation.is_isolated() { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }, + persistent_allocations: false, }, - persistent_allocations: false, - }) + None, + ) .context("get dma client")?, ); } @@ -1720,18 +1726,24 
@@ async fn new_underhill_vm( Some(virt_mshv_vtl::CvmLateParams { shared_gm: cvm_mem.shared_gm.clone(), isolated_memory_protector: cvm_mem.protector.clone(), - shared_dma_client: dma_manager.new_client(DmaClientParameters { - device_name: "partition-shared".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: AllocationVisibility::Shared, - persistent_allocations: false, - })?, - private_dma_client: dma_manager.new_client(DmaClientParameters { - device_name: "partition-private".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: AllocationVisibility::Private, - persistent_allocations: false, - })?, + shared_dma_client: dma_manager.new_client( + DmaClientParameters { + device_name: "partition-shared".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Any, + allocation_visibility: AllocationVisibility::Shared, + persistent_allocations: false, + }, + None, + )?, + private_dma_client: dma_manager.new_client( + DmaClientParameters { + device_name: "partition-private".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Any, + allocation_visibility: AllocationVisibility::Private, + persistent_allocations: false, + }, + None, + )?, }) } else { None @@ -2860,12 +2872,15 @@ async fn new_underhill_vm( let shutdown_guest = SimpleVmbusClientDeviceWrapper::new( driver_source.simple(), dma_manager - .new_client(DmaClientParameters { - device_name: "shutdown-relay".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, - allocation_visibility: AllocationVisibility::Private, - persistent_allocations: false, - }) + .new_client( + DmaClientParameters { + device_name: "shutdown-relay".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, + allocation_visibility: AllocationVisibility::Private, + persistent_allocations: false, + }, + None, + ) .context("shutdown relay dma client")?, shutdown_guest, )?; diff --git a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs index 0e8e5c2b29..1463767ebb 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs @@ -53,11 +53,6 @@ impl Dev self.device.dma_client() } - // Not implemented for Fuzzer yet. - fn fallback_dma_client(&self) -> Option> { - None - } - /// Arbitrarily decide to passthrough or return arbitrary value. fn max_interrupt_count(&self) -> u32 { // Case: Fuzz response diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index de94ccc423..66bbe00994 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -34,6 +34,7 @@ use thiserror::Error; use tracing::Instrument; use tracing::info_span; use user_driver::DeviceBacking; +use user_driver::DmaClientAllocStats; use user_driver::backoff::Backoff; use user_driver::interrupt::DeviceInterrupt; use user_driver::memory::MemoryBlock; @@ -92,8 +93,6 @@ struct WorkerState { qsize: u16, #[inspect(skip)] async_event_task: Task<()>, - /// Tracks how much memory was allocated using fallback allocator. - fallback_alloc: u64, } /// An error restoring from saved state. @@ -164,7 +163,7 @@ enum NvmeWorkerRequest { /// Save worker state. Save(Rpc<(), anyhow::Result>), /// Query how much memory was allocated with fallback allocator. 
- QueryAllocatorFallback(Rpc<(), u64>), + QueryAllocatorStats(Rpc<(), DmaClientAllocStats>), } impl NvmeDriver { @@ -292,11 +291,6 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; - let fallback_alloc = admin.fallback_alloc(); - if fallback_alloc > 0 { - // Unconditionally override keepalive if fallback allocator was involved. - self.nvme_keepalive = false; - } let admin = worker.admin.insert(admin); // Register the admin queue with the controller. @@ -436,7 +430,6 @@ impl NvmeDriver { qsize, async_event_task, max_io_queues, - fallback_alloc, // May be updated in create_io_queue() below. }; self.admin = Some(admin.issuer().clone()); @@ -645,7 +638,6 @@ impl NvmeDriver { qsize: saved_state.worker_data.qsize, async_event_task, max_io_queues: saved_state.worker_data.max_io_queues, - fallback_alloc: 0, // This is restore() which implies that everything was saved. }; this.admin = Some(admin.issuer().clone()); @@ -706,13 +698,16 @@ impl NvmeDriver { } /// Queries worker task if memory allocator ever fell back. - pub async fn query_fallback_alloc(&self) -> u64 { + pub async fn get_alloc_stats(&self) -> DmaClientAllocStats { let fb = self .io_issuers .send - .call(NvmeWorkerRequest::QueryAllocatorFallback, ()) + .call(NvmeWorkerRequest::QueryAllocatorStats, ()) .await; - fb.unwrap_or_default() + fb.unwrap_or(DmaClientAllocStats { + total_alloc: 0, + fallback_alloc: 0, + }) } } @@ -813,8 +808,11 @@ impl AsyncRun for DriverWorkerTask { Some(NvmeWorkerRequest::Save(rpc)) => { rpc.handle(async |_| self.save(state).await).await } - Some(NvmeWorkerRequest::QueryAllocatorFallback(rpc)) => { - rpc.complete(state.fallback_alloc) + Some(NvmeWorkerRequest::QueryAllocatorStats(rpc)) => { + rpc.complete(DmaClientAllocStats { + total_alloc: self.device.dma_client().alloc_size(), + fallback_alloc: self.device.dma_client().fallback_alloc_size(), + }) } None => break, } @@ -892,8 +890,6 @@ impl DriverWorkerTask { self.registers.clone(), ) .with_context(|| format!("failed to create io queue pair {qid}"))?; - // Keep tracking if fallback allocator was ever used. - state.fallback_alloc += queue.fallback_alloc(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index 4f283747ef..8b11e06a57 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -45,13 +45,6 @@ use zerocopy::FromZeros; /// Value for unused PRP entries, to catch/mitigate buffer size mismatches. const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1); -/// Memory allocator errors. 
-#[derive(Debug, Error)] -pub enum AllocatorError { - #[error("no fallback mem allocator")] - NoFallbackAllocator, -} - pub(crate) struct QueuePair { task: Task, cancel: Cancel, @@ -60,7 +53,6 @@ pub(crate) struct QueuePair { qid: u16, sq_entries: u16, cq_entries: u16, - fallback_alloc: u64, } impl Inspect for QueuePair { @@ -73,7 +65,6 @@ impl Inspect for QueuePair { qid: _, sq_entries: _, cq_entries: _, - fallback_alloc: _, } = self; issuer.send.send(Req::Inspect(req.defer())); } @@ -193,34 +184,15 @@ impl QueuePair { assert!(sq_entries <= Self::MAX_SQ_ENTRIES); assert!(cq_entries <= Self::MAX_CQ_ENTRIES); - let mut fallback_alloc = 0u64; let total_size = QueuePair::SQ_SIZE + QueuePair::CQ_SIZE + QueuePair::PER_QUEUE_PAGES * PAGE_SIZE; let dma_client = device.dma_client(); let mem = dma_client .allocate_dma_buffer(total_size) - .context("failed to allocate memory for the queues") - .or_else(|_| { - fallback_alloc += total_size as u64; - device.fallback_dma_client().map_or( - Err(AllocatorError::NoFallbackAllocator.into()), - |f| { - f.allocate_dma_buffer(total_size) - .context("fallback mem allocator also failed") - }, - ) - }); + .context("failed to allocate memory for the queues")?; QueuePair::new_or_restore( - spawner, - qid, - sq_entries, - cq_entries, - interrupt, - registers, - mem?, - None, - fallback_alloc, + spawner, qid, sq_entries, cq_entries, interrupt, registers, mem, None, ) } @@ -234,7 +206,6 @@ impl QueuePair { registers: Arc>, mem: MemoryBlock, saved_state: Option<&QueueHandlerSavedState>, - fallback_alloc: u64, ) -> anyhow::Result { // MemoryBlock is either allocated or restored prior calling here. let sq_mem_block = mem.subblock(0, QueuePair::SQ_SIZE); @@ -284,7 +255,6 @@ impl QueuePair { qid, sq_entries, cq_entries, - fallback_alloc, }) } @@ -305,11 +275,6 @@ impl QueuePair { self.task.await } - /// Indicates that fallback allocator was used for this queue pair. - pub fn fallback_alloc(&self) -> u64 { - self.fallback_alloc - } - /// Save queue pair state for servicing. pub async fn save(&self) -> anyhow::Result { // Return error if the queue does not have any memory allocated. @@ -356,7 +321,6 @@ impl QueuePair { registers, mem, Some(handler_data), - 0u64, ) } } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index 16b827c46f..afaef105d7 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -366,11 +366,6 @@ impl Dev self.device.dma_client() } - // TODO: We should add one for testing. - fn fallback_dma_client(&self) -> Option> { - None - } - fn max_interrupt_count(&self) -> u32 { self.device.max_interrupt_count() } diff --git a/vm/devices/user_driver/src/lib.rs b/vm/devices/user_driver/src/lib.rs index 1461c382e8..7724fdd2bb 100644 --- a/vm/devices/user_driver/src/lib.rs +++ b/vm/devices/user_driver/src/lib.rs @@ -33,9 +33,6 @@ pub trait DeviceBacking: 'static + Send + Inspect { /// DMA Client for the device. fn dma_client(&self) -> Arc; - /// Fallback DMA Client for the device. - fn fallback_dma_client(&self) -> Option>; - /// Returns the maximum number of interrupts that can be mapped. fn max_interrupt_count(&self) -> u32; @@ -72,4 +69,21 @@ pub trait DmaClient: Send + Sync + Inspect { /// Attach all previously allocated memory blocks. fn attach_pending_buffers(&self) -> anyhow::Result>; + + /// Query if this client supports persistent allocations. 
+ fn is_persistent(&self) -> bool; + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64; + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64; +} + +/// DMA allocator statistics per client. +pub struct DmaClientAllocStats { + /// How much memory (bytes) was allocated by a main allocator. + pub total_alloc: u64, + /// How much memory (bytes) was allocated by a fallback allocator. + pub fallback_alloc: u64, } diff --git a/vm/devices/user_driver/src/lockmem.rs b/vm/devices/user_driver/src/lockmem.rs index b7cb190630..154fb90fb6 100644 --- a/vm/devices/user_driver/src/lockmem.rs +++ b/vm/devices/user_driver/src/lockmem.rs @@ -134,4 +134,19 @@ impl crate::DmaClient for LockedMemorySpawner { fn attach_pending_buffers(&self) -> anyhow::Result> { anyhow::bail!("restore not supported for lockmem") } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + false + } + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64 { + 0 + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + 0 + } } diff --git a/vm/devices/user_driver/src/vfio.rs b/vm/devices/user_driver/src/vfio.rs index 137d1db334..4151c56079 100644 --- a/vm/devices/user_driver/src/vfio.rs +++ b/vm/devices/user_driver/src/vfio.rs @@ -57,7 +57,6 @@ pub struct VfioDevice { #[inspect(skip)] config_space: vfio_sys::RegionInfo, dma_client: Arc, - fallback_dma_client: Option>, } #[derive(Inspect)] @@ -75,16 +74,8 @@ impl VfioDevice { driver_source: &VmTaskDriverSource, pci_id: &str, dma_client: Arc, - fallback_dma_client: Option>, ) -> anyhow::Result { - Self::restore( - driver_source, - pci_id, - false, - dma_client, - fallback_dma_client, - ) - .await + Self::restore(driver_source, pci_id, false, dma_client).await } /// Creates a new VFIO-backed device for the PCI device with `pci_id`. @@ -94,7 +85,6 @@ impl VfioDevice { pci_id: &str, vf_keepalive: bool, dma_client: Arc, - fallback_dma_client: Option>, ) -> anyhow::Result { let path = Path::new("/sys/bus/pci/devices").join(pci_id); @@ -141,7 +131,6 @@ impl VfioDevice { driver_source: driver_source.clone(), interrupts: Vec::new(), dma_client, - fallback_dma_client, }; // Ensure bus master enable and memory space enable are set, and that @@ -246,10 +235,6 @@ impl DeviceBacking for VfioDevice { self.dma_client.clone() } - fn fallback_dma_client(&self) -> Option> { - self.fallback_dma_client.clone() - } - fn max_interrupt_count(&self) -> u32 { self.msix_info.count } diff --git a/vm/devices/user_driver_emulated_mock/src/lib.rs b/vm/devices/user_driver_emulated_mock/src/lib.rs index 0ed650d957..fb68810cd5 100644 --- a/vm/devices/user_driver_emulated_mock/src/lib.rs +++ b/vm/devices/user_driver_emulated_mock/src/lib.rs @@ -162,11 +162,6 @@ impl Option> { - None - } - fn max_interrupt_count(&self) -> u32 { self.controller.events.len() as u32 } diff --git a/vm/page_pool_alloc/src/lib.rs b/vm/page_pool_alloc/src/lib.rs index c55ba44655..352c87cee7 100644 --- a/vm/page_pool_alloc/src/lib.rs +++ b/vm/page_pool_alloc/src/lib.rs @@ -657,6 +657,8 @@ pub struct PagePoolAllocator { inner: Arc, #[inspect(skip)] device_id: usize, + /// Total alloc size in bytes for the duration. 
+ alloc_size: Mutex, } impl PagePoolAllocator { @@ -695,6 +697,7 @@ impl PagePoolAllocator { Ok(Self { inner: inner.clone(), device_id, + alloc_size: Mutex::new(0), }) } @@ -867,7 +870,7 @@ impl user_driver::DmaClient for PagePoolAllocator { let alloc = self .alloc(size_pages, "vfio dma".into()) - .context("failed to allocate shared mem")?; + .context("failed to allocate from page pool")?; // The VfioDmaBuffer trait requires that newly allocated buffers are // zeroed. @@ -883,6 +886,21 @@ impl user_driver::DmaClient for PagePoolAllocator { .map(|alloc| alloc.into_memory_block()) .collect() } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + true + } + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64 { + 0 + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + 0 + } } #[cfg(test)] From 727fd7dabd6fb6e00968f0a8291ed5a7e3353298 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 25 Apr 2025 16:35:11 +0000 Subject: [PATCH 16/18] Tracking and inspecting improvements --- Cargo.lock | 1 + .../lower_vtl_permissions_guard/Cargo.toml | 1 + .../lower_vtl_permissions_guard/src/lib.rs | 8 +++++-- openhcl/openhcl_dma_manager/src/lib.rs | 5 ++-- vm/devices/user_driver/src/lockmem.rs | 24 +++++++++++++++---- vm/page_pool_alloc/src/lib.rs | 7 +++--- 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab0f6572ad..93fd5aed9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3808,6 +3808,7 @@ dependencies = [ "anyhow", "hvdef", "inspect", + "parking_lot", "user_driver", "virt", ] diff --git a/openhcl/lower_vtl_permissions_guard/Cargo.toml b/openhcl/lower_vtl_permissions_guard/Cargo.toml index d4e2b17496..809158724d 100644 --- a/openhcl/lower_vtl_permissions_guard/Cargo.toml +++ b/openhcl/lower_vtl_permissions_guard/Cargo.toml @@ -9,6 +9,7 @@ rust-version.workspace = true [target.'cfg(target_os = "linux")'.dependencies] hvdef.workspace = true inspect.workspace = true +parking_lot.workspace = true user_driver.workspace = true virt.workspace = true diff --git a/openhcl/lower_vtl_permissions_guard/src/lib.rs b/openhcl/lower_vtl_permissions_guard/src/lib.rs index 1b2c08f048..616267cbba 100644 --- a/openhcl/lower_vtl_permissions_guard/src/lib.rs +++ b/openhcl/lower_vtl_permissions_guard/src/lib.rs @@ -13,6 +13,7 @@ pub use device_dma::LowerVtlDmaBuffer; use anyhow::Context; use anyhow::Result; use inspect::Inspect; +use parking_lot::Mutex; use std::sync::Arc; use user_driver::DmaClient; use user_driver::memory::MemoryBlock; @@ -79,6 +80,7 @@ pub struct LowerVtlMemorySpawner { spawner: T, #[inspect(skip)] vtl_protect: Arc, + alloc_size: Mutex, } impl LowerVtlMemorySpawner { @@ -88,6 +90,7 @@ impl LowerVtlMemorySpawner { Self { spawner, vtl_protect, + alloc_size: Mutex::new(0), } } } @@ -98,6 +101,7 @@ impl DmaClient for LowerVtlMemorySpawner { let vtl_guard = PagesAccessibleToLowerVtl::new_from_pages(self.vtl_protect.clone(), mem.pfns()) .context("failed to lower VTL permissions on memory block")?; + *self.alloc_size.lock() += len as u64; Ok(MemoryBlock::new(LowerVtlDmaBuffer { block: mem, @@ -116,10 +120,10 @@ impl DmaClient for LowerVtlMemorySpawner { /// How much memory was allocated during session. fn alloc_size(&self) -> u64 { - 0 + *self.alloc_size.lock() } - /// How much backup memory was allocated during session (fallback). + /// Not supported for this allocator. 
fn fallback_alloc_size(&self) -> u64 { 0 } diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 819817edc6..3b371cde45 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -298,13 +298,13 @@ impl DmaManagerInner { LowerVtlPermissionPolicy::Any => { // No persistence needed means the `LockedMemorySpawner` // using normal VTL2 ram is fine. - DmaClientBacking::LockedMemory(LockedMemorySpawner) + DmaClientBacking::LockedMemory(LockedMemorySpawner::new()) } LowerVtlPermissionPolicy::Vtl0 => { // `LockedMemorySpawner` uses private VTL2 ram, so // lowering VTL permissions is required. DmaClientBacking::LockedMemoryLowerVtl(LowerVtlMemorySpawner::new( - LockedMemorySpawner, + LockedMemorySpawner::new(), self.lower_vtl.clone(), )) } @@ -499,6 +499,7 @@ impl DmaClient for OpenhclDmaClient { &self, total_size: usize, ) -> anyhow::Result { + // The stats must be tracked here, not in the backing. let mut stats = self.inner_stats.lock(); stats.total_alloc += total_size as u64; let mem_block = self.backing.allocate_dma_buffer(total_size).or_else(|_| { diff --git a/vm/devices/user_driver/src/lockmem.rs b/vm/devices/user_driver/src/lockmem.rs index 154fb90fb6..aa9fee6bf6 100644 --- a/vm/devices/user_driver/src/lockmem.rs +++ b/vm/devices/user_driver/src/lockmem.rs @@ -6,6 +6,7 @@ use crate::memory::MappedDmaTarget; use anyhow::Context; use inspect::Inspect; +use parking_lot::Mutex; use std::ffi::c_void; use std::fs::File; use std::io::Read; @@ -123,12 +124,25 @@ unsafe impl MappedDmaTarget for LockedMemory { } } -#[derive(Clone, Inspect)] -pub struct LockedMemorySpawner; +#[derive(Inspect)] +pub struct LockedMemorySpawner { + alloc_size: Mutex, +} + +impl LockedMemorySpawner { + /// Create a new [`LockedMemorySpawner`]. + pub fn new() -> Self { + Self { + alloc_size: Mutex::new(0), + } + } +} impl crate::DmaClient for LockedMemorySpawner { fn allocate_dma_buffer(&self, len: usize) -> anyhow::Result { - Ok(crate::memory::MemoryBlock::new(LockedMemory::new(len)?)) + let mem_block = crate::memory::MemoryBlock::new(LockedMemory::new(len)?); + *self.alloc_size.lock() += len as u64; + Ok(mem_block) } fn attach_pending_buffers(&self) -> anyhow::Result> { @@ -142,10 +156,10 @@ impl crate::DmaClient for LockedMemorySpawner { /// How much memory was allocated during session. fn alloc_size(&self) -> u64 { - 0 + *self.alloc_size.lock() } - /// How much backup memory was allocated during session (fallback). + /// Not supported for this allocator. fn fallback_alloc_size(&self) -> u64 { 0 } diff --git a/vm/page_pool_alloc/src/lib.rs b/vm/page_pool_alloc/src/lib.rs index 352c87cee7..8abaca0562 100644 --- a/vm/page_pool_alloc/src/lib.rs +++ b/vm/page_pool_alloc/src/lib.rs @@ -657,7 +657,7 @@ pub struct PagePoolAllocator { inner: Arc, #[inspect(skip)] device_id: usize, - /// Total alloc size in bytes for the duration. + /// Total alloc size in bytes for the session duration. alloc_size: Mutex, } @@ -871,6 +871,7 @@ impl user_driver::DmaClient for PagePoolAllocator { let alloc = self .alloc(size_pages, "vfio dma".into()) .context("failed to allocate from page pool")?; + *self.alloc_size.lock() += len as u64; // The VfioDmaBuffer trait requires that newly allocated buffers are // zeroed. @@ -894,10 +895,10 @@ impl user_driver::DmaClient for PagePoolAllocator { /// How much memory was allocated during session. 
fn alloc_size(&self) -> u64 { - 0 + *self.alloc_size.lock() } - /// How much backup memory was allocated during session (fallback). + /// Not supported for this allocator. fn fallback_alloc_size(&self) -> u64 { 0 } From 09efefbafe7a31188865319ec3473debac88cad1 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 25 Apr 2025 20:08:40 +0000 Subject: [PATCH 17/18] Inform openhcl about self hint and print it --- openhcl/bootloader_fdt_parser/src/lib.rs | 14 ++++++++++++++ openhcl/openhcl_boot/src/dt.rs | 5 +++++ openhcl/openhcl_boot/src/host_params/dt.rs | 5 +++++ openhcl/openhcl_boot/src/host_params/mod.rs | 3 +++ openhcl/openhcl_boot/src/main.rs | 1 + openhcl/openhcl_dma_manager/src/lib.rs | 14 ++++++++++++++ openhcl/underhill_core/src/worker.rs | 8 ++++++++ vm/page_pool_alloc/src/lib.rs | 7 +++++++ 8 files changed, 57 insertions(+) diff --git a/openhcl/bootloader_fdt_parser/src/lib.rs b/openhcl/bootloader_fdt_parser/src/lib.rs index 2b056ca72f..b6c04f6794 100644 --- a/openhcl/bootloader_fdt_parser/src/lib.rs +++ b/openhcl/bootloader_fdt_parser/src/lib.rs @@ -169,6 +169,8 @@ pub struct ParsedBootDtInfo { /// VTL2 range for private pool memory. #[inspect(iter_by_index)] pub private_pool_ranges: Vec, + /// Source of DMA hint calculation. + pub dma_hint_self: bool, } fn err_to_owned(e: fdt::parser::Error<'_>) -> anyhow::Error { @@ -207,6 +209,7 @@ struct OpenhclInfo { memory_allocation_mode: MemoryAllocationMode, isolation: IsolationType, private_pool_ranges: Vec, + dma_hint_self: bool, } fn parse_memory_openhcl(node: &Node<'_>) -> anyhow::Result { @@ -394,6 +397,11 @@ fn parse_openhcl(node: &Node<'_>) -> anyhow::Result { .transpose() .context("unable to read vtl0-alias-map")?; + let dma_hint_self = matches!( + try_find_property(node, "dma-hint").and_then(|p| p.read_str().ok()), + Some("self") + ); + // Extract vmbus mmio information from the overall memory map. 
let vtl0_mmio = memory .iter() @@ -416,6 +424,7 @@ fn parse_openhcl(node: &Node<'_>) -> anyhow::Result { memory_allocation_mode, isolation, private_pool_ranges, + dma_hint_self, }) } @@ -509,6 +518,7 @@ impl ParsedBootDtInfo { let mut isolation = IsolationType::None; let mut vtl2_reserved_range = MemoryRange::EMPTY; let mut private_pool_ranges = Vec::new(); + let mut dma_hint_self = false; let parser = Parser::new(raw) .map_err(err_to_owned) @@ -538,6 +548,7 @@ impl ParsedBootDtInfo { memory_allocation_mode: n_memory_allocation_mode, isolation: n_isolation, private_pool_ranges: n_private_pool_ranges, + dma_hint_self: n_dma_hint_self, } = parse_openhcl(&child)?; vtl0_mmio = n_vtl0_mmio; config_ranges = n_config_ranges; @@ -548,6 +559,7 @@ impl ParsedBootDtInfo { isolation = n_isolation; vtl2_reserved_range = n_vtl2_reserved_range; private_pool_ranges = n_private_pool_ranges; + dma_hint_self = n_dma_hint_self; } _ if child.name.starts_with("memory@") => { @@ -580,6 +592,7 @@ impl ParsedBootDtInfo { isolation, vtl2_reserved_range, private_pool_ranges, + dma_hint_self, }) } } @@ -945,6 +958,7 @@ mod tests { range: MemoryRange::new(0x60000..0x70000), vnode: 0, }], + dma_hint_self: false, }; let dt = build_dt(&orig_info).unwrap(); diff --git a/openhcl/openhcl_boot/src/dt.rs b/openhcl/openhcl_boot/src/dt.rs index 3a85058c9c..40b16536df 100644 --- a/openhcl/openhcl_boot/src/dt.rs +++ b/openhcl/openhcl_boot/src/dt.rs @@ -483,6 +483,11 @@ pub fn write_dt( openhcl_builder = openhcl_builder.add_u64(p_vtl0_alias_map, data)?; } + if partition_info.dma_hint_self { + let p_dma_hint = openhcl_builder.add_string("dma-hint")?; + openhcl_builder = openhcl_builder.add_str(p_dma_hint, "self")?; + } + #[derive(Debug, Copy, Clone, PartialEq, Eq)] struct Vtl2MemoryEntry { range: MemoryRange, diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index fbadf47feb..de72ea9676 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -456,6 +456,7 @@ impl PartitionInfo { .vtl2_used_ranges .extend(flatten_ranges(used_ranges.iter().copied())); + let mut vtl2_dma_hint_self = false; // Decide if we will reserve memory for a VTL2 private pool. Parse this // from the final command line, or the host provided device tree value. let vtl2_gpa_pool_size = { @@ -472,6 +473,7 @@ impl PartitionInfo { { // If host did not provide the DMA hint value, re-evaluate // it internally if conditions satisfy. 
+ vtl2_dma_hint_self = true; vtl2_calculate_dma_hint(parsed.cpu_count(), storage) } else { hostval @@ -512,6 +514,7 @@ impl PartitionInfo { .extend(flatten_ranges(used_ranges.iter().copied())); storage.vtl2_pool_memory = pool; + storage.dma_hint_self = vtl2_dma_hint_self; } // If we can trust the host, use the provided alias map @@ -540,6 +543,7 @@ impl PartitionInfo { entropy, vtl0_alias_map: _, nvme_keepalive, + dma_hint_self, } = storage; assert!(!vtl2_used_ranges.is_empty()); @@ -562,6 +566,7 @@ impl PartitionInfo { *gic = parsed.gic.clone(); *entropy = parsed.entropy.clone(); *nvme_keepalive = parsed.nvme_keepalive; + *dma_hint_self = vtl2_dma_hint_self; Ok(Some(storage)) } diff --git a/openhcl/openhcl_boot/src/host_params/mod.rs b/openhcl/openhcl_boot/src/host_params/mod.rs index 05b087f011..b53652715a 100644 --- a/openhcl/openhcl_boot/src/host_params/mod.rs +++ b/openhcl/openhcl_boot/src/host_params/mod.rs @@ -95,6 +95,8 @@ pub struct PartitionInfo { pub vtl0_alias_map: Option, /// Host is compatible with DMA preservation / NVMe keep-alive. pub nvme_keepalive: bool, + /// DMA hint was calculated in boot-shim instead of host. + pub dma_hint_self: bool, } impl PartitionInfo { @@ -126,6 +128,7 @@ impl PartitionInfo { entropy: None, vtl0_alias_map: None, nvme_keepalive: false, + dma_hint_self: false, } } diff --git a/openhcl/openhcl_boot/src/main.rs b/openhcl/openhcl_boot/src/main.rs index d2cfce8e74..648f7f5be0 100644 --- a/openhcl/openhcl_boot/src/main.rs +++ b/openhcl/openhcl_boot/src/main.rs @@ -956,6 +956,7 @@ mod test { entropy: None, vtl0_alias_map: None, nvme_keepalive: false, + dma_hint_self: false, } } diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 3b371cde45..76ea8c2fff 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -407,6 +407,20 @@ impl OpenhclDmaManager { Ok(()) } + + /// Return shared pool size in bytes. + pub fn shared_pool_size(&self) -> u64 { + self.shared_pool + .as_ref() + .map_or(0, |pool| pool.total_size()) + } + + /// Return private pool size in bytes. + pub fn private_pool_size(&self) -> u64 { + self.private_pool + .as_ref() + .map_or(0, |pool| pool.total_size()) + } } /// A spawner for creating DMA clients. diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 2fb8e487af..2c1051941a 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -1518,6 +1518,14 @@ async fn new_underhill_vm( .context("failed to restore global dma manager")?; } + // Print important info about DMA sizes. + tracing::info!( + dma_hint_self = boot_info.dma_hint_self, + shared_pool_size = shared_pool_size, + private_pool_size = dma_manager.private_pool_size(), + "dma pool" + ); + // Test with the highest VTL for which we have a GuestMemory object let highest_vtl_gm = gm.vtl1().unwrap_or(gm.vtl0()); diff --git a/vm/page_pool_alloc/src/lib.rs b/vm/page_pool_alloc/src/lib.rs index 8abaca0562..69234c2484 100644 --- a/vm/page_pool_alloc/src/lib.rs +++ b/vm/page_pool_alloc/src/lib.rs @@ -504,6 +504,7 @@ pub struct PagePool { inner: Arc, #[inspect(iter_by_index)] ranges: Vec, + total_len: u64, } impl PagePool { @@ -557,6 +558,7 @@ impl PagePool { mapping, }), ranges: memory.to_vec(), + total_len: total_len as u64, }) } @@ -621,6 +623,11 @@ impl PagePool { Ok(()) } } + + /// Returns the total size of the pool in bytes. 
+ pub fn total_size(&self) -> u64 { + self.total_len + } } /// A spawner for [`PagePoolAllocator`] instances. From 1c34003b3a3c0f81cf6cb7c94a0d894f4b0e4766 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Mon, 28 Apr 2025 13:52:56 -0700 Subject: [PATCH 18/18] Swapped one row --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index e0925a74e9..6208a95602 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -27,7 +27,7 @@ const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ (8, 170, 20), (8, 176, 20), (16, 234, 12), - (16, 256, 18), // There is another configuration with '20'. + (16, 256, 20), // There is another configuration with '18'. (16, 268, 38), (16, 282, 54), (24, 420, 66), @@ -50,7 +50,7 @@ const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ (112, 1566, 288), (128, 1342, 84), (128, 1360, 84), - // (896, 12912, 516), // Needs to be validated as the vNIC number is unknown. + (896, 12912, 0), // (516) Needs to be validated as the vNIC number is unknown. ]; /// Round up to next 2MiB.
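    // [Illustrative sketch, not part of the patch] The tuple columns are not
    // labeled in this excerpt; assuming they are (VP count, VTL2 memory, DMA
    // hint) in a consistent unit, a lookup over this table could look like:
    //
    //     fn lookup_hint(vp_count: u16, mem: u16) -> Option<u16> {
    //         LOOKUP_TABLE
    //             .iter()
    //             .filter(|(vps, _, _)| *vps == vp_count)
    //             .min_by_key(|(_, m, _)| m.abs_diff(mem))
    //             .map(|&(_, _, hint)| hint)
    //     }
    //
    // i.e. take the rows for this VP count, pick the one whose memory column
    // is closest to the partition's assigned memory, and use its hint column.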