From e56fecca8a8c7de417ba474e4d8ce7e44b311609 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Wed, 9 Apr 2025 17:59:58 -0700 Subject: [PATCH 01/18] Thinking where to add it --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 8 ++++++++ openhcl/openhcl_boot/src/host_params/dt.rs | 16 +++++++++++++++- openhcl/openhcl_boot/src/host_params/mod.rs | 1 + 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 openhcl/openhcl_boot/src/host_params/dma_hint.rs diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs new file mode 100644 index 0000000000..e06ac05612 --- /dev/null +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! Calculate DMA hint value if not provided by host. + +pub fn vtl2_calculate_dma_hint() -> u64 { + 16384 +} diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 1ae52e24c0..21c72808b2 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -13,6 +13,7 @@ use crate::host_params::MAX_ENTROPY_SIZE; use crate::host_params::MAX_NUMA_NODES; use crate::host_params::MAX_PARTITION_RAM_RANGES; use crate::host_params::MAX_VTL2_USED_RANGES; +use crate::host_params::dma_hint::vtl2_calculate_dma_hint; use crate::single_threaded::OffStackRef; use crate::single_threaded::off_stack; use arrayvec::ArrayVec; @@ -337,6 +338,7 @@ impl PartitionInfo { let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?; let command_line = params.command_line(); + log!("YSP: device_dma_page_count {}", parsed.device_dma_page_count.unwrap_or(0)); // Always write the measured command line. write!( @@ -358,6 +360,7 @@ impl PartitionInfo { match parsed.memory_allocation_mode { MemoryAllocationMode::Host => { + log!("YSP: MemoryAllocationMode::Host"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -369,6 +372,7 @@ impl PartitionInfo { memory_size, mmio_size, } => { + log!("YSP: MemoryAllocationMode::Vtl2"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -457,7 +461,7 @@ impl PartitionInfo { // Decide if we will reserve memory for a VTL2 private pool. Parse this // from the final command line, or the host provided device tree value. - let vtl2_gpa_pool_size = { + let mut vtl2_gpa_pool_size = { let dt_page_count = parsed.device_dma_page_count; let cmdline_page_count = crate::cmdline::parse_boot_command_line(storage.cmdline.as_str()) @@ -465,6 +469,15 @@ impl PartitionInfo { max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)) }; + // If host did not provide the DMA hint value, re-evaluate + // it internally if conditions satisfy. + if vtl2_gpa_pool_size == 0 && parsed.nvme_keepalive { + let dma_hint = vtl2_calculate_dma_hint(); + if dma_hint != 0 { + vtl2_gpa_pool_size = dma_hint; + //*parsed.device_dma_page_count = Some(dma_hint); + } + } if vtl2_gpa_pool_size != 0 { // Reserve the specified number of pages for the pool. Use the used // ranges to figure out which VTL2 memory is free to allocate from. 
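The hint produced by the vtl2_calculate_dma_hint stub above, like the final heuristic later in the series, is a count of 4 KiB pages rather than bytes, so the hard-coded 16384 corresponds to a 64 MiB VTL2 pool reservation. A standalone sketch of the unit conversion (illustrative code, not part of the patch):

    fn main() {
        const PAGE_SIZE_4K: u64 = 4096;
        let hint_pages: u64 = 16384; // value returned by the initial stub
        // 16384 pages * 4 KiB per page = 64 MiB reserved for the VTL2 private pool.
        assert_eq!(hint_pages * PAGE_SIZE_4K, 64 * 1024 * 1024);
    }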
@@ -501,6 +514,7 @@ impl PartitionInfo { storage.vtl2_pool_memory = pool; } + log!("YSP: vtl2_gpa_pool_size {}", vtl2_gpa_pool_size); // If we can trust the host, use the provided alias map if can_trust_host { diff --git a/openhcl/openhcl_boot/src/host_params/mod.rs b/openhcl/openhcl_boot/src/host_params/mod.rs index ea9ac0d422..05b087f011 100644 --- a/openhcl/openhcl_boot/src/host_params/mod.rs +++ b/openhcl/openhcl_boot/src/host_params/mod.rs @@ -15,6 +15,7 @@ use memory_range::MemoryRange; use memory_range::subtract_ranges; use shim_params::IsolationType; +mod dma_hint; mod dt; mod mmio; pub mod shim_params; From 4391f6985e2e63f73f4875c9828e53397558cbe9 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Wed, 9 Apr 2025 18:11:34 -0700 Subject: [PATCH 02/18] Improved syntax --- openhcl/openhcl_boot/src/host_params/dt.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 21c72808b2..578cbd072b 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -461,23 +461,21 @@ impl PartitionInfo { // Decide if we will reserve memory for a VTL2 private pool. Parse this // from the final command line, or the host provided device tree value. - let mut vtl2_gpa_pool_size = { + let vtl2_gpa_pool_size = { let dt_page_count = parsed.device_dma_page_count; let cmdline_page_count = crate::cmdline::parse_boot_command_line(storage.cmdline.as_str()) .enable_vtl2_gpa_pool; - max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)) - }; - // If host did not provide the DMA hint value, re-evaluate - // it internally if conditions satisfy. - if vtl2_gpa_pool_size == 0 && parsed.nvme_keepalive { - let dma_hint = vtl2_calculate_dma_hint(); - if dma_hint != 0 { - vtl2_gpa_pool_size = dma_hint; - //*parsed.device_dma_page_count = Some(dma_hint); + let hostval = max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)); + if hostval == 0 && parsed.nvme_keepalive { + // If host did not provide the DMA hint value, re-evaluate + // it internally if conditions satisfy. + vtl2_calculate_dma_hint() + } else { + hostval } - } + }; if vtl2_gpa_pool_size != 0 { // Reserve the specified number of pages for the pool. Use the used // ranges to figure out which VTL2 memory is free to allocate from. From ac049360bd1caba866f94e9c162a04b1cc02638b Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 10 Apr 2025 21:52:57 +0000 Subject: [PATCH 03/18] More experiments --- openhcl/openhcl_boot/src/dt.rs | 2 ++ openhcl/openhcl_boot/src/host_params/dma_hint.rs | 14 +++++++++++++- openhcl/openhcl_boot/src/host_params/dt.rs | 7 +++++-- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/openhcl/openhcl_boot/src/dt.rs b/openhcl/openhcl_boot/src/dt.rs index 3a85058c9c..15393f0e70 100644 --- a/openhcl/openhcl_boot/src/dt.rs +++ b/openhcl/openhcl_boot/src/dt.rs @@ -9,6 +9,7 @@ use crate::ReservedMemoryType; use crate::host_params::COMMAND_LINE_SIZE; use crate::host_params::PartitionInfo; use crate::host_params::shim_params::IsolationType; +use crate::log; // YSP use crate::sidecar::SidecarConfig; use crate::single_threaded::off_stack; use arrayvec::ArrayString; @@ -211,6 +212,7 @@ pub fn write_dt( let p_enable_method = builder.add_string("enable-method")?; let num_cpus = partition_info.cpus.len(); + log!("YSP: num_cpus2 = {}", num_cpus); let mut root_builder = builder .start_node("")? 
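The dma_hint.rs changes that follow feed two inputs into the heuristic: the VP count from the parsed device tree and the total RAM marked VTL2_PROTECTABLE. A minimal sketch of how the second input reduces to a MiB figure, using plain (start, end) byte ranges in place of the real PartitionInfo/MemoryEntry types:

    fn vtl2_protectable_mb(ranges: &[(u64, u64)]) -> u64 {
        // Sum the lengths of the VTL2_PROTECTABLE ranges, then convert bytes to MiB,
        // mirroring the iterator chain here and the `/ 1048576` added in the next commit.
        let bytes: u64 = ranges.iter().map(|(start, end)| end - start).sum();
        bytes / (1024 * 1024)
    }

    fn main() {
        // 0x6200000 bytes (a value used in the later unit tests) is 98 MiB.
        assert_eq!(vtl2_protectable_mb(&[(0x0, 0x6200000)]), 98);
    }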
diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index e06ac05612..8c74a32ddc 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -3,6 +3,18 @@ //! Calculate DMA hint value if not provided by host. -pub fn vtl2_calculate_dma_hint() -> u64 { +use crate::log; +use igvm_defs::MemoryMapEntryType; +use super::PartitionInfo; + +pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { + log!("YSP: vp_count = {}", vp_count); + let mem_size = storage + .vtl2_ram + .iter() + .filter(|m| m.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE) + .map(|e| e.range.len()) + .sum::(); + log!("YSP: mem_size = {}", mem_size); 16384 } diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 578cbd072b..886f50f307 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -468,10 +468,13 @@ impl PartitionInfo { .enable_vtl2_gpa_pool; let hostval = max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)); - if hostval == 0 && parsed.nvme_keepalive { + if hostval == 0 && + parsed.nvme_keepalive && + params.isolation_type == IsolationType::None && + storage.memory_allocation_mode == MemoryAllocationMode::Host { // If host did not provide the DMA hint value, re-evaluate // it internally if conditions satisfy. - vtl2_calculate_dma_hint() + vtl2_calculate_dma_hint(parsed.cpu_count(), storage) } else { hostval } From b45f919b4079bac5c2f3f724bdbd7848e0980988 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 10 Apr 2025 23:01:20 +0000 Subject: [PATCH 04/18] Added table lookup (incomplete table yet) --- .../openhcl_boot/src/host_params/dma_hint.rs | 75 ++++++++++++++++++- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 8c74a32ddc..410cbf8772 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -7,7 +7,61 @@ use crate::log; use igvm_defs::MemoryMapEntryType; use super::PartitionInfo; +struct DmaLookupStruct { + /// Logical processors for VM. + vp_count: u32, + /// Vtl2AddressRangeSize - Vtl2MmioAddressRangeSize. + vtl2_memory_mb: u32, + /// DMA hint in MiB. + dma_hint_mb: u32, +} + +/// Lookup table for DMA hint calculation. 
+const LookupTable: &'static [DmaLookupStruct] = &[ + DmaLookupStruct { + vp_count: 2, + vtl2_memory_mb: 98, + dma_hint_mb: 4, + }, + DmaLookupStruct { + vp_count: 4, + vtl2_memory_mb: 110, + dma_hint_mb: 6, + }, + DmaLookupStruct { + vp_count: 8, + vtl2_memory_mb: 148, + dma_hint_mb: 10, + }, + DmaLookupStruct { + vp_count: 16, + vtl2_memory_mb: 256, + dma_hint_mb: 18, + }, + DmaLookupStruct { + vp_count: 32, + vtl2_memory_mb: 516, + dma_hint_mb: 36, + }, + DmaLookupStruct { + vp_count: 48, + vtl2_memory_mb: 718, + dma_hint_mb: 52, + }, + DmaLookupStruct { + vp_count: 64, + vtl2_memory_mb: 924, + dma_hint_mb: 68, + }, + DmaLookupStruct { + vp_count: 96, + vtl2_memory_mb: 1340, + dma_hint_mb: 102, + }, +]; + pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { + let mut dma_hint = 0; log!("YSP: vp_count = {}", vp_count); let mem_size = storage .vtl2_ram @@ -15,6 +69,23 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 .filter(|m| m.mem_type == MemoryMapEntryType::VTL2_PROTECTABLE) .map(|e| e.range.len()) .sum::(); - log!("YSP: mem_size = {}", mem_size); - 16384 + // Sanity check for the calculated memory size. + if mem_size > 0 && mem_size < 0xFFFFFFFF00000 { + let mem_size_mb = (mem_size / 1048576) as u32; + log!("YSP: mem_size_mb = {}", mem_size_mb); + + LookupTable + .iter() + .filter(|e| e.vp_count == vp_count as u32) + .for_each(|f| { + if f.vtl2_memory_mb == mem_size_mb { + // Found exact match. + dma_hint = f.dma_hint_mb as u64; + } else { + // Try to extrapolate based on similar values. + } + }); + } + + dma_hint } From 3d744e2f9abf38cf279e2f474f186950354941c8 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 10 Apr 2025 23:32:52 +0000 Subject: [PATCH 05/18] Minor styling --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 410cbf8772..cb4c9df1ef 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -4,7 +4,7 @@ //! Calculate DMA hint value if not provided by host. use crate::log; -use igvm_defs::MemoryMapEntryType; +use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; struct DmaLookupStruct { @@ -17,7 +17,7 @@ struct DmaLookupStruct { } /// Lookup table for DMA hint calculation. -const LookupTable: &'static [DmaLookupStruct] = &[ +const LOOKUP_TABLE: &'static [DmaLookupStruct] = &[ DmaLookupStruct { vp_count: 2, vtl2_memory_mb: 98, @@ -60,8 +60,9 @@ const LookupTable: &'static [DmaLookupStruct] = &[ }, ]; +/// Returns calculated DMA hint value, in 4k pages. pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { - let mut dma_hint = 0; + let mut dma_hint_4k = 0; log!("YSP: vp_count = {}", vp_count); let mem_size = storage .vtl2_ram @@ -74,18 +75,18 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 let mem_size_mb = (mem_size / 1048576) as u32; log!("YSP: mem_size_mb = {}", mem_size_mb); - LookupTable + LOOKUP_TABLE .iter() .filter(|e| e.vp_count == vp_count as u32) .for_each(|f| { if f.vtl2_memory_mb == mem_size_mb { // Found exact match. - dma_hint = f.dma_hint_mb as u64; + dma_hint_4k = f.dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; } else { // Try to extrapolate based on similar values. 
} }); } - dma_hint + dma_hint_4k } From 5670edfe2b1b1dcd39cec780b10b47c64d9f17fc Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Tue, 15 Apr 2025 19:11:17 +0000 Subject: [PATCH 06/18] Add unit test --- .../openhcl_boot/src/host_params/dma_hint.rs | 80 +++++++++++++++++-- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index cb4c9df1ef..2b82707017 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -75,18 +75,86 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 let mem_size_mb = (mem_size / 1048576) as u32; log!("YSP: mem_size_mb = {}", mem_size_mb); - LOOKUP_TABLE - .iter() - .filter(|e| e.vp_count == vp_count as u32) - .for_each(|f| { + let mut min_vtl2_memory_mb = 1000000; + let mut max_vtl2_memory_mb = 0; + // TODO: If we won't allow floats in boot-shim, replace with scaled integers + let mut min_ratio: f32 = 0.1; + let mut max_ratio: f32 = 0.01; + + let mut min_vp_count = 1; + let mut max_vp_count = 4096; + + for f in LOOKUP_TABLE { + if f.vp_count == vp_count as u32 { if f.vtl2_memory_mb == mem_size_mb { // Found exact match. dma_hint_4k = f.dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; + break; } else { - // Try to extrapolate based on similar values. + // Prepare for possible extrapolation. + min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); + min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); } - }); + } else if f.vp_count < vp_count as u32 { + // Find the nearest VP counts if exact match is not in the table. + min_vp_count = min_vp_count.max(f.vp_count); + } else if f.vp_count > vp_count as u32 { + max_vp_count = max_vp_count.min(f.vp_count); + } + } + + // It is possible there were no matching entries in the lookup table. + // (i.e. unexpected VP count). + if max_vtl2_memory_mb == 0 { + LOOKUP_TABLE + .iter() + .filter(|e| e.vp_count == min_vp_count || e.vp_count == max_vp_count) + .for_each(|f| { + // Prepare for possible extrapolation. + min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); + min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + }); + } + + if dma_hint_4k == 0 { + // If we didn't find an exact match for our vp_count, try to extrapolate. 
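            // (Extrapolation idea: each table row yields a hint-to-memory ratio;
            //  the statement below averages the smallest and largest ratios collected
            //  above and applies that average to this VM's VTL2 memory size, then
            //  converts MiB to 4 KiB pages.)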
+ dma_hint_4k = (mem_size_mb as f32 * ((min_ratio + max_ratio) / 2.0)) as u64 * + 1048576 / + PAGE_SIZE_4K; + } } dma_hint_4k } + +#[cfg(test)] +mod test { + use super::*; + use crate::host_params::MemoryEntry; + use crate::MemoryRange; + + #[test] + fn test_vtl2_calculate_dma_hint() { + let mut storage = PartitionInfo::new(); + + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x6200000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(2, &storage), 1024); + + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x6E00000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(4, &storage), 1536); + } +} From 536e7bc46b3351e2d0b506f3fb29f2df01afeebf Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Tue, 15 Apr 2025 19:38:04 +0000 Subject: [PATCH 07/18] More tests --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 2b82707017..55ad0aef77 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -156,5 +156,15 @@ mod test { vnode: 0, }); assert_eq!(vtl2_calculate_dma_hint(4, &storage), 1536); + + // Test VP count higher than max from LOOKUP_TABLE. + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x7000000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + // TODO: Unfinished, maybe the return value is incorrect. + assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2048); } } From 936d26265f698c7427d0915b7d9d4a59ca77c882 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 17 Apr 2025 11:07:52 -0700 Subject: [PATCH 08/18] Replace float point with integers --- Cargo.lock | 1 + openhcl/openhcl_boot/Cargo.toml | 3 + openhcl/openhcl_boot/src/dt.rs | 2 - .../openhcl_boot/src/host_params/dma_hint.rs | 57 +++++++++++++------ openhcl/openhcl_boot/src/host_params/dt.rs | 4 -- 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dda653c599..13cbe2d9d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4786,6 +4786,7 @@ dependencies = [ "sha2", "sidecar_defs", "tdcall", + "test_with_tracing", "underhill_confidentiality", "x86defs", "zerocopy 0.8.24", diff --git a/openhcl/openhcl_boot/Cargo.toml b/openhcl/openhcl_boot/Cargo.toml index f8d8a70e0a..0e9cbe6b14 100644 --- a/openhcl/openhcl_boot/Cargo.toml +++ b/openhcl/openhcl_boot/Cargo.toml @@ -37,3 +37,6 @@ minimal_rt_build.workspace = true [lints] workspace = true + +[dev-dependencies] +test_with_tracing.workspace = true diff --git a/openhcl/openhcl_boot/src/dt.rs b/openhcl/openhcl_boot/src/dt.rs index 15393f0e70..3a85058c9c 100644 --- a/openhcl/openhcl_boot/src/dt.rs +++ b/openhcl/openhcl_boot/src/dt.rs @@ -9,7 +9,6 @@ use crate::ReservedMemoryType; use crate::host_params::COMMAND_LINE_SIZE; use crate::host_params::PartitionInfo; use crate::host_params::shim_params::IsolationType; -use crate::log; // YSP use crate::sidecar::SidecarConfig; use crate::single_threaded::off_stack; use arrayvec::ArrayString; @@ -212,7 +211,6 @@ pub fn write_dt( let p_enable_method = builder.add_string("enable-method")?; let num_cpus = partition_info.cpus.len(); - log!("YSP: num_cpus2 = {}", num_cpus); let mut root_builder = builder .start_node("")? 
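In the dma_hint.rs hunk below, the f32 ratios are replaced with integers scaled by 1000, addressing the earlier TODO about not allowing floats in the boot shim. Note the ratio also flips direction: it becomes vtl2_memory_mb * 1000 / dma_hint_mb, i.e. MiB of VTL2 memory per MiB of hint. A standalone sketch of the same arithmetic using one row of the table (8 VPs, 148 MiB of VTL2 memory, 10 MiB hint):

    fn main() {
        // Ratio scaled by 1000 to stay in integer math.
        let ratio_1000th: u64 = 148 * 1000 / 10; // 14_800
        // Dividing a scaled memory size by the ratio recovers a hint in MiB...
        let hint_mb = 148u64 * 1000 / ratio_1000th;
        assert_eq!(hint_mb, 10);
        // ...which the code then multiplies by 256 (1048576 / 4096) to get 4 KiB pages.
        assert_eq!(hint_mb * 256, 2560);
    }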
diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 55ad0aef77..1f7ccfd46c 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -3,7 +3,6 @@ //! Calculate DMA hint value if not provided by host. -use crate::log; use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; @@ -60,10 +59,14 @@ const LOOKUP_TABLE: &'static [DmaLookupStruct] = &[ }, ]; +/// Round up to next 2MiB. +fn round_up_to_2mb(pages_4k: u64) -> u64 { + (pages_4k + 511) & !(511) +} + /// Returns calculated DMA hint value, in 4k pages. pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 { let mut dma_hint_4k = 0; - log!("YSP: vp_count = {}", vp_count); let mem_size = storage .vtl2_ram .iter() @@ -73,16 +76,16 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // Sanity check for the calculated memory size. if mem_size > 0 && mem_size < 0xFFFFFFFF00000 { let mem_size_mb = (mem_size / 1048576) as u32; - log!("YSP: mem_size_mb = {}", mem_size_mb); let mut min_vtl2_memory_mb = 1000000; let mut max_vtl2_memory_mb = 0; - // TODO: If we won't allow floats in boot-shim, replace with scaled integers - let mut min_ratio: f32 = 0.1; - let mut max_ratio: f32 = 0.01; + + // To avoid using floats, scale ratios to 1:1000. + let mut min_ratio_1000th = 100000; + let mut max_ratio_1000th = 1000; let mut min_vp_count = 1; - let mut max_vp_count = 4096; + let mut max_vp_count = vp_count as u32; for f in LOOKUP_TABLE { if f.vp_count == vp_count as u32 { @@ -94,8 +97,8 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // Prepare for possible extrapolation. min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); - max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); } } else if f.vp_count < vp_count as u32 { // Find the nearest VP counts if exact match is not in the table. @@ -112,19 +115,20 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 .iter() .filter(|e| e.vp_count == min_vp_count || e.vp_count == max_vp_count) .for_each(|f| { - // Prepare for possible extrapolation. min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio = min_ratio.min(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); - max_ratio = max_ratio.max(f.dma_hint_mb as f32 / f.vtl2_memory_mb as f32); + min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); }); } - + if dma_hint_4k == 0 { // If we didn't find an exact match for our vp_count, try to extrapolate. - dma_hint_4k = (mem_size_mb as f32 * ((min_ratio + max_ratio) / 2.0)) as u64 * - 1048576 / - PAGE_SIZE_4K; + dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) / + ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); + + // And then round up to 2MiB. 
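        // (For reference: 2 MiB is 512 pages of 4 KiB, so round_up_to_2mb computes
        //  (pages_4k + 511) & !511; e.g. 1 rounds up to 512 and 513 rounds up to 1024.)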
+ dma_hint_4k = round_up_to_2mb(dma_hint_4k); } } @@ -136,6 +140,7 @@ mod test { use super::*; use crate::host_params::MemoryEntry; use crate::MemoryRange; + use test_with_tracing::test; #[test] fn test_vtl2_calculate_dma_hint() { @@ -164,7 +169,23 @@ mod test { mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, vnode: 0, }); - // TODO: Unfinished, maybe the return value is incorrect. - assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2048); + assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2560); + + // Test unusual VP count. + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x6000000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(52, &storage), 2048); + + storage.vtl2_ram.clear(); + storage.vtl2_ram.push(MemoryEntry { + range: MemoryRange::new(0x0..0x8000000), + mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, + vnode: 0, + }); + assert_eq!(vtl2_calculate_dma_hint(52, &storage), 2560); } } diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 886f50f307..5fb0d1870d 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -338,7 +338,6 @@ impl PartitionInfo { let parsed = ParsedDeviceTree::parse(dt, &mut *dt_storage).map_err(DtError::DeviceTree)?; let command_line = params.command_line(); - log!("YSP: device_dma_page_count {}", parsed.device_dma_page_count.unwrap_or(0)); // Always write the measured command line. write!( @@ -360,7 +359,6 @@ impl PartitionInfo { match parsed.memory_allocation_mode { MemoryAllocationMode::Host => { - log!("YSP: MemoryAllocationMode::Host"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -372,7 +370,6 @@ impl PartitionInfo { memory_size, mmio_size, } => { - log!("YSP: MemoryAllocationMode::Vtl2"); storage.vtl2_ram.clear(); storage .vtl2_ram @@ -515,7 +512,6 @@ impl PartitionInfo { storage.vtl2_pool_memory = pool; } - log!("YSP: vtl2_gpa_pool_size {}", vtl2_gpa_pool_size); // If we can trust the host, use the provided alias map if can_trust_host { From 3b37398af33e9021e4c2ee497665d814bb02ed9d Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 17 Apr 2025 15:48:21 -0700 Subject: [PATCH 09/18] Make it more compact --- .../openhcl_boot/src/host_params/dma_hint.rs | 126 +++++++----------- 1 file changed, 48 insertions(+), 78 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 1f7ccfd46c..a62a4e0b4f 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -6,57 +6,21 @@ use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; -struct DmaLookupStruct { - /// Logical processors for VM. - vp_count: u32, - /// Vtl2AddressRangeSize - Vtl2MmioAddressRangeSize. - vtl2_memory_mb: u32, - /// DMA hint in MiB. - dma_hint_mb: u32, -} - /// Lookup table for DMA hint calculation. 
-const LOOKUP_TABLE: &'static [DmaLookupStruct] = &[ - DmaLookupStruct { - vp_count: 2, - vtl2_memory_mb: 98, - dma_hint_mb: 4, - }, - DmaLookupStruct { - vp_count: 4, - vtl2_memory_mb: 110, - dma_hint_mb: 6, - }, - DmaLookupStruct { - vp_count: 8, - vtl2_memory_mb: 148, - dma_hint_mb: 10, - }, - DmaLookupStruct { - vp_count: 16, - vtl2_memory_mb: 256, - dma_hint_mb: 18, - }, - DmaLookupStruct { - vp_count: 32, - vtl2_memory_mb: 516, - dma_hint_mb: 36, - }, - DmaLookupStruct { - vp_count: 48, - vtl2_memory_mb: 718, - dma_hint_mb: 52, - }, - DmaLookupStruct { - vp_count: 64, - vtl2_memory_mb: 924, - dma_hint_mb: 68, - }, - DmaLookupStruct { - vp_count: 96, - vtl2_memory_mb: 1340, - dma_hint_mb: 102, - }, +/// Using tuples instead of structs to keep it readable. +/// Let's keep the table sorted by VP count, then by assigned memory. +/// Using u16 to keep the memory req short. +/// Max VTL2 memory known today is 24838 MiB. +/// (vp_count, vtl2_memory_mb, dma_hint_mb) +const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ + (2, 98, 4), + (4, 110, 6), + (8, 148, 10), + (16, 256, 18), + (32, 516, 36), + (48, 718, 52), + (64, 924, 68), + (96, 1340, 102), ]; /// Round up to next 2MiB. @@ -77,34 +41,40 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 if mem_size > 0 && mem_size < 0xFFFFFFFF00000 { let mem_size_mb = (mem_size / 1048576) as u32; - let mut min_vtl2_memory_mb = 1000000; + let mut min_vtl2_memory_mb = 65535; let mut max_vtl2_memory_mb = 0; // To avoid using floats, scale ratios to 1:1000. let mut min_ratio_1000th = 100000; let mut max_ratio_1000th = 1000; - let mut min_vp_count = 1; - let mut max_vp_count = vp_count as u32; - - for f in LOOKUP_TABLE { - if f.vp_count == vp_count as u32 { - if f.vtl2_memory_mb == mem_size_mb { - // Found exact match. - dma_hint_4k = f.dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; - break; - } else { - // Prepare for possible extrapolation. - min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); - max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + let mut min_vp_count: u16 = 1; + let mut max_vp_count = vp_count as u16; + + for (vp_lookup, vtl2_memory_mb, dma_hint_mb) in LOOKUP_TABLE { + match (*vp_lookup).cmp(&(vp_count as u16)) { + core::cmp::Ordering::Less => { + // Find nearest. + min_vp_count = min_vp_count.max(*vp_lookup); + } + core::cmp::Ordering::Equal => { + if *vtl2_memory_mb == mem_size_mb as u16 { + // Found exact match. + dma_hint_4k = *dma_hint_mb as u64 * 1048576 / PAGE_SIZE_4K; + max_vtl2_memory_mb = *vtl2_memory_mb; + break; + } else { + // Prepare for possible extrapolation. + min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); + min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + } + } + core::cmp::Ordering::Greater => { + // Find nearest. + max_vp_count = max_vp_count.min(*vp_lookup); } - } else if f.vp_count < vp_count as u32 { - // Find the nearest VP counts if exact match is not in the table. 
- min_vp_count = min_vp_count.max(f.vp_count); - } else if f.vp_count > vp_count as u32 { - max_vp_count = max_vp_count.min(f.vp_count); } } @@ -113,17 +83,17 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 if max_vtl2_memory_mb == 0 { LOOKUP_TABLE .iter() - .filter(|e| e.vp_count == min_vp_count || e.vp_count == max_vp_count) - .for_each(|f| { - min_vtl2_memory_mb = min_vtl2_memory_mb.min(f.vtl2_memory_mb); - max_vtl2_memory_mb = max_vtl2_memory_mb.max(f.vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(f.vtl2_memory_mb as u32 * 1000 / f.dma_hint_mb as u32); + .filter(|(vp_lookup, _, _)| *vp_lookup == min_vp_count || *vp_lookup == max_vp_count) + .for_each(|(_, vtl2_memory_mb, dma_hint_mb)| { + min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); + min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); }); } if dma_hint_4k == 0 { - // If we didn't find an exact match for our vp_count, try to extrapolate. + // Didn't find an exact match for vp_count, try to extrapolate. dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) / ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); From 989c2c962ea6eea8af1726619c2a59cd5c514868 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 18 Apr 2025 00:42:44 +0000 Subject: [PATCH 10/18] Expand the table --- .../openhcl_boot/src/host_params/dma_hint.rs | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index a62a4e0b4f..7ffd62f569 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -13,14 +13,44 @@ use super::PartitionInfo; /// Max VTL2 memory known today is 24838 MiB. /// (vp_count, vtl2_memory_mb, dma_hint_mb) const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ + (2, 96, 2), (2, 98, 4), + (2, 100, 4), + (2, 104, 4), + (4, 108, 2), (4, 110, 6), + (4, 112, 6), + (4, 118, 8), + (4, 130, 12), + (8, 140, 4), (8, 148, 10), - (16, 256, 18), + (8, 170, 20), + (8, 176, 20), + (16, 234, 12), + (16, 256, 18), // There is another configuration with '20'. + (16, 268, 38), + (16, 282, 54), + (24, 420, 66), + (32, 404, 22), (32, 516, 36), + (32, 538, 74), // There is another configuration with '52'. + (48, 558, 32), (48, 718, 52), + (48, 730, 52), + (48, 746, 78), + (64, 712, 42), (64, 924, 68), + (64, 938, 68), + (96, 1030, 64), + (96, 1042, 114), // Can be '64'. + (96, 1058, 114), // Can be '106'. (96, 1340, 102), + (96, 1358, 104), + (96, 1382, 120), + (112, 1566, 288), + (128, 1342, 84), + (128, 1360, 84), + // (896, 12912, 516), // Needs to be validated as the vNIC number is unknown. ]; /// Round up to next 2MiB. @@ -139,7 +169,7 @@ mod test { mem_type: MemoryMapEntryType::VTL2_PROTECTABLE, vnode: 0, }); - assert_eq!(vtl2_calculate_dma_hint(112, &storage), 2560); + assert_eq!(vtl2_calculate_dma_hint(112, &storage), 5632); // Test unusual VP count. 
storage.vtl2_ram.clear(); From 3ad70408564a5d6649c02c6480ce733bdbdc06e7 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Wed, 23 Apr 2025 23:48:57 +0000 Subject: [PATCH 11/18] Some progress to propagate fallback flag uphill but not quite done yet --- .../openhcl_boot/src/host_params/dma_hint.rs | 38 +++++---- openhcl/openhcl_boot/src/host_params/dt.rs | 9 ++- openhcl/openhcl_dma_manager/src/lib.rs | 2 +- openhcl/underhill_core/src/dispatch/mod.rs | 11 ++- openhcl/underhill_core/src/emuplat/netvsp.rs | 2 +- openhcl/underhill_core/src/lib.rs | 2 +- openhcl/underhill_core/src/nvme_manager.rs | 78 +++++++++++++++---- openhcl/underhill_core/src/options.rs | 6 +- openhcl/underhill_core/src/worker.rs | 11 ++- .../nvme_driver/fuzz/fuzz_emulated_device.rs | 5 ++ .../disk_nvme/nvme_driver/src/driver.rs | 20 +++-- .../disk_nvme/nvme_driver/src/queue_pair.rs | 38 +++++++-- .../disk_nvme/nvme_driver/src/tests.rs | 5 ++ vm/devices/user_driver/src/lib.rs | 3 + vm/devices/user_driver/src/vfio.rs | 21 ++++- .../user_driver_emulated_mock/src/lib.rs | 5 ++ 16 files changed, 195 insertions(+), 61 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index 7ffd62f569..e0925a74e9 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -3,8 +3,8 @@ //! Calculate DMA hint value if not provided by host. -use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; use super::PartitionInfo; +use igvm_defs::{MemoryMapEntryType, PAGE_SIZE_4K}; /// Lookup table for DMA hint calculation. /// Using tuples instead of structs to keep it readable. @@ -97,8 +97,10 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // Prepare for possible extrapolation. min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + min_ratio_1000th = min_ratio_1000th + .min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = max_ratio_1000th + .max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); } } core::cmp::Ordering::Greater => { @@ -112,20 +114,24 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 // (i.e. unexpected VP count). 
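        // (When that happens, min_vp_count and max_vp_count computed above hold the
        //  nearest VP counts that do appear in the table, and their rows seed the same
        //  min/max memory and ratio bounds used for the extrapolation below.)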
if max_vtl2_memory_mb == 0 { LOOKUP_TABLE - .iter() - .filter(|(vp_lookup, _, _)| *vp_lookup == min_vp_count || *vp_lookup == max_vp_count) - .for_each(|(_, vtl2_memory_mb, dma_hint_mb)| { - min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); - max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); - min_ratio_1000th = min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); - max_ratio_1000th = max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); - }); + .iter() + .filter(|(vp_lookup, _, _)| { + *vp_lookup == min_vp_count || *vp_lookup == max_vp_count + }) + .for_each(|(_, vtl2_memory_mb, dma_hint_mb)| { + min_vtl2_memory_mb = min_vtl2_memory_mb.min(*vtl2_memory_mb); + max_vtl2_memory_mb = max_vtl2_memory_mb.max(*vtl2_memory_mb); + min_ratio_1000th = + min_ratio_1000th.min(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + max_ratio_1000th = + max_ratio_1000th.max(*vtl2_memory_mb as u32 * 1000 / *dma_hint_mb as u32); + }); } if dma_hint_4k == 0 { // Didn't find an exact match for vp_count, try to extrapolate. - dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) / - ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); + dma_hint_4k = (mem_size_mb as u64 * 1000u64 * (1048576u64 / PAGE_SIZE_4K)) + / ((min_ratio_1000th + max_ratio_1000th) as u64 / 2u64); // And then round up to 2MiB. dma_hint_4k = round_up_to_2mb(dma_hint_4k); @@ -138,14 +144,14 @@ pub fn vtl2_calculate_dma_hint(vp_count: usize, storage: &PartitionInfo) -> u64 #[cfg(test)] mod test { use super::*; - use crate::host_params::MemoryEntry; use crate::MemoryRange; + use crate::host_params::MemoryEntry; use test_with_tracing::test; #[test] fn test_vtl2_calculate_dma_hint() { let mut storage = PartitionInfo::new(); - + storage.vtl2_ram.clear(); storage.vtl2_ram.push(MemoryEntry { range: MemoryRange::new(0x0..0x6200000), @@ -153,7 +159,7 @@ mod test { vnode: 0, }); assert_eq!(vtl2_calculate_dma_hint(2, &storage), 1024); - + storage.vtl2_ram.clear(); storage.vtl2_ram.push(MemoryEntry { range: MemoryRange::new(0x0..0x6E00000), diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index 5fb0d1870d..fbadf47feb 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -465,10 +465,11 @@ impl PartitionInfo { .enable_vtl2_gpa_pool; let hostval = max(dt_page_count.unwrap_or(0), cmdline_page_count.unwrap_or(0)); - if hostval == 0 && - parsed.nvme_keepalive && - params.isolation_type == IsolationType::None && - storage.memory_allocation_mode == MemoryAllocationMode::Host { + if hostval == 0 + && parsed.nvme_keepalive + && params.isolation_type == IsolationType::None + && storage.memory_allocation_mode == MemoryAllocationMode::Host + { // If host did not provide the DMA hint value, re-evaluate // it internally if conditions satisfy. vtl2_calculate_dma_hint(parsed.cpu_count(), storage) diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 42e79d7297..64db4d2b0a 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -122,7 +122,7 @@ pub struct OpenhclDmaManager { } /// The required VTL permissions on DMA allocations. -#[derive(Inspect)] +#[derive(Clone, Inspect)] pub enum LowerVtlPermissionPolicy { /// No specific permission constraints are required. 
Any, diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index bf3c460597..04db01ad34 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -7,6 +7,7 @@ mod pci_shutdown; pub mod vtl2_settings_worker; use self::vtl2_settings_worker::DeviceInterfaces; +use crate::nvme_manager; use crate::ControlRequest; use crate::emuplat::EmuplatServicing; use crate::emuplat::netvsp::RuntimeSavedState; @@ -180,7 +181,7 @@ pub(crate) struct LoadedVm { pub _periodic_telemetry_task: Task<()>, - pub nvme_keep_alive: bool, + pub nvme_keepalive: bool, pub test_configuration: Option, pub dma_manager: OpenhclDmaManager, } @@ -494,7 +495,13 @@ impl LoadedVm { // NOTE: This is set via the corresponding env arg, as this feature is // experimental. - let nvme_keepalive = self.nvme_keep_alive && capabilities_flags.enable_nvme_keepalive(); + let nvme_keepalive_runtime_enabled = if let Some(nvme_manager) = self.nvme_manager.as_ref() { + nvme_manager.is_keepalive_still_enabled().await + } else { + false + }; + let nvme_keepalive = self.nvme_keepalive && capabilities_flags.enable_nvme_keepalive() && nvme_keepalive_runtime_enabled; + tracing::info!("YSP: {nvme_keepalive_runtime_enabled} {nvme_keepalive}"); // Do everything before the log flush under a span. let r = async { diff --git a/openhcl/underhill_core/src/emuplat/netvsp.rs b/openhcl/underhill_core/src/emuplat/netvsp.rs index bf517208fb..9398e0fe21 100644 --- a/openhcl/underhill_core/src/emuplat/netvsp.rs +++ b/openhcl/underhill_core/src/emuplat/netvsp.rs @@ -120,7 +120,7 @@ async fn try_create_mana_device( max_sub_channels: u16, dma_client: Arc, ) -> anyhow::Result> { - let device = VfioDevice::new(driver_source, pci_id, dma_client) + let device = VfioDevice::new(driver_source, pci_id, dma_client, None) .await .context("failed to open device")?; diff --git a/openhcl/underhill_core/src/lib.rs b/openhcl/underhill_core/src/lib.rs index 41038dbd07..5f101eebb9 100644 --- a/openhcl/underhill_core/src/lib.rs +++ b/openhcl/underhill_core/src/lib.rs @@ -318,7 +318,7 @@ async fn launch_workers( no_sidecar_hotplug: opt.no_sidecar_hotplug, gdbstub: opt.gdbstub, hide_isolation: opt.hide_isolation, - nvme_keep_alive: opt.nvme_keep_alive, + nvme_keepalive: opt.nvme_keepalive, test_configuration: opt.test_configuration, disable_uefi_frontpage: opt.disable_uefi_frontpage, }; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 2580996140..6bea0f422f 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -26,8 +26,10 @@ use pal_async::task::Spawn; use pal_async::task::Task; use std::collections::HashMap; use std::collections::hash_map; +use std::sync::Arc; use thiserror::Error; use tracing::Instrument; +use user_driver::DmaClient; use user_driver::vfio::VfioDevice; use vm_resource::AsyncResolveResource; use vm_resource::ResourceId; @@ -126,6 +128,15 @@ impl NvmeManager { &self.client } + /// Save could have been disabled if fallback allocator was used. + pub async fn is_keepalive_still_enabled(&self) -> bool { + let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await { + Ok(s) => s, + Err(_) => false, + }; + self.save_restore_supported && worker_save_restore + } + pub async fn shutdown(self, nvme_keepalive: bool) { // Early return is faster way to skip shutdown. // but we need to thoroughly test the data integrity. 
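Together with the dispatch/mod.rs change above, keepalive now has to clear three independent gates before the servicing path passes nvme_keepalive=true into the shutdown seen here. A condensed sketch of that decision (the function name is illustrative; the three flags mirror the ones used in this series):

    fn nvme_keepalive_effective(env_opt_in: bool, host_allows: bool, runtime_ok: bool) -> bool {
        // env_opt_in:  OPENHCL_NVME_KEEP_ALIVE was set for this VM
        // host_allows: the host advertised enable_nvme_keepalive in capabilities_flags
        // runtime_ok:  no device had to fall back to the non-persistent DMA allocator
        env_opt_in && host_allows && runtime_ok
    }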
@@ -175,6 +186,7 @@ enum Request { span: tracing::Span, nvme_keepalive: bool, }, + KeepAliveStatus(Rpc<(), bool>), } #[derive(Debug, Clone)] @@ -267,6 +279,9 @@ impl NvmeManagerWorker { } break (span, nvme_keepalive); } + Request::KeepAliveStatus(rpc) => { + rpc.complete(self.save_restore_supported); + } } }; @@ -295,24 +310,52 @@ impl NvmeManagerWorker { let driver = match self.devices.entry(pci_id.to_owned()) { hash_map::Entry::Occupied(entry) => entry.into_mut(), hash_map::Entry::Vacant(entry) => { + let device_name = format!("nvme_{}", pci_id); + let allocation_visibility = if self.is_isolated { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }; + let lower_vtl_policy = LowerVtlPermissionPolicy::Any; + + // Persistent allocations use fixed size memory. + // Create a fallback allocator which uses heap. + // When fallback allocator is involved, nvme_keepalive + // will be implicitly disabled. + let fallback_dma_client = if self.save_restore_supported { + let client: Arc = self + .dma_client_spawner + .new_client(DmaClientParameters { + device_name: device_name.clone(), + lower_vtl_policy: lower_vtl_policy.clone(), + allocation_visibility, + persistent_allocations: false, + }) + .map_err(InnerError::DmaClient)?; + Some(client) + } else { + None + }; + let dma_client = self .dma_client_spawner .new_client(DmaClientParameters { - device_name: format!("nvme_{}", pci_id), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: if self.is_isolated { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private - }, + device_name, + lower_vtl_policy, + allocation_visibility, persistent_allocations: self.save_restore_supported, }) .map_err(InnerError::DmaClient)?; - let device = VfioDevice::new(&self.driver_source, entry.key(), dma_client) - .instrument(tracing::info_span!("vfio_device_open", pci_id)) - .await - .map_err(InnerError::Vfio)?; + let device = VfioDevice::new( + &self.driver_source, + entry.key(), + dma_client, + fallback_dma_client, + ) + .instrument(tracing::info_span!("vfio_device_open", pci_id)) + .await + .map_err(InnerError::Vfio)?; let driver = nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device) @@ -380,10 +423,15 @@ impl NvmeManagerWorker { // This code can wait on each VFIO device until it is arrived. // A potential optimization would be to delay VFIO operation // until it is ready, but a redesign of VfioDevice is needed. - let vfio_device = - VfioDevice::restore(&self.driver_source, &disk.pci_id.clone(), true, dma_client) - .instrument(tracing::info_span!("vfio_device_restore", pci_id)) - .await?; + let vfio_device = VfioDevice::restore( + &self.driver_source, + &disk.pci_id.clone(), + true, + dma_client, + None, + ) + .instrument(tracing::info_span!("vfio_device_restore", pci_id)) + .await?; let nvme_driver = nvme_driver::NvmeDriver::restore( &self.driver_source, diff --git a/openhcl/underhill_core/src/options.rs b/openhcl/underhill_core/src/options.rs index 5dd9254f8c..647f1b8442 100644 --- a/openhcl/underhill_core/src/options.rs +++ b/openhcl/underhill_core/src/options.rs @@ -140,7 +140,7 @@ pub struct Options { pub no_sidecar_hotplug: bool, /// (OPENHCL_NVME_KEEP_ALIVE=1) Enable nvme keep alive when servicing. 
- pub nvme_keep_alive: bool, + pub nvme_keepalive: bool, /// (OPENHCL_TEST_CONFIG=\) /// Test configurations are designed to replicate specific behaviors and @@ -237,7 +237,7 @@ impl Options { let no_sidecar_hotplug = parse_legacy_env_bool("OPENHCL_NO_SIDECAR_HOTPLUG"); let gdbstub = parse_legacy_env_bool("OPENHCL_GDBSTUB"); let gdbstub_port = parse_legacy_env_number("OPENHCL_GDBSTUB_PORT")?.map(|x| x as u32); - let nvme_keep_alive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE"); + let nvme_keepalive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE"); let test_configuration = parse_env_string("OPENHCL_TEST_CONFIG").and_then(|x| { x.to_string_lossy() .parse::() @@ -304,7 +304,7 @@ impl Options { hide_isolation, halt_on_guest_halt, no_sidecar_hotplug, - nvme_keep_alive, + nvme_keepalive, test_configuration, disable_uefi_frontpage, }) diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 6c668a387f..3913e8e657 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -280,8 +280,8 @@ pub struct UnderhillEnvCfg { pub gdbstub: bool, /// Hide the isolation mode from the guest. pub hide_isolation: bool, - /// Enable nvme keep alive. - pub nvme_keep_alive: bool, + /// Enable nvme keep-alive. + pub nvme_keepalive: bool, /// test configuration pub test_configuration: Option, @@ -1839,7 +1839,10 @@ async fn new_underhill_vm( // TODO: reevaluate enablement of nvme save restore when private pool // save restore to bootshim is available. let private_pool_available = !runtime_params.private_pool_ranges().is_empty(); - let save_restore_supported = env_cfg.nvme_keep_alive && private_pool_available; + // Two separate flags because: + // - private pool alone can be used for other purposes; + // - host must explicitly indicate that keepalive is supported (compatibility). + let save_restore_supported = env_cfg.nvme_keepalive && private_pool_available; let manager = NvmeManager::new( &driver_source, @@ -3004,7 +3007,7 @@ async fn new_underhill_vm( control_send, _periodic_telemetry_task: periodic_telemetry_task, - nvme_keep_alive: env_cfg.nvme_keep_alive, + nvme_keepalive: env_cfg.nvme_keepalive, test_configuration: env_cfg.test_configuration, dma_manager, }; diff --git a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs index 1463767ebb..0e8e5c2b29 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs @@ -53,6 +53,11 @@ impl Dev self.device.dma_client() } + // Not implemented for Fuzzer yet. + fn fallback_dma_client(&self) -> Option> { + None + } + /// Arbitrarily decide to passthrough or return arbitrary value. fn max_interrupt_count(&self) -> u32 { // Case: Fuzz response diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 558be0a035..2dfc57e508 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -288,6 +288,11 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; + if admin.fallback_used() { + // Unconditionally disable keepalive if fallback allocator was involved. + self.nvme_keepalive = false; + tracing::info!("YSP: nvme_keepalive explicitly disabled"); + } let admin = worker.admin.insert(admin); // Register the admin queue with the controller. 
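The admin.fallback_used() check above is the consumer side of a pattern introduced in queue_pair.rs later in this commit: try the persistent DMA client first, fall back to an ordinary allocation if that fails, and report that the fallback fired so keepalive can be disabled. A generic sketch of that shape (closures and Vec<u8> stand in for the real DmaClient and MemoryBlock types):

    fn alloc_with_fallback(
        primary: &dyn Fn(usize) -> Option<Vec<u8>>,
        fallback: Option<&dyn Fn(usize) -> Option<Vec<u8>>>,
        len: usize,
    ) -> Option<(Vec<u8>, bool)> {
        if let Some(mem) = primary(len) {
            return Some((mem, false));
        }
        // `true` tells the caller this memory is not persistent across servicing,
        // so nvme_keepalive must be turned off for this boot.
        fallback.and_then(|f| f(len)).map(|mem| (mem, true))
    }

In the real driver the boolean travels up through QueuePair::fallback_used() into the driver's nvme_keepalive flag.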
@@ -433,7 +438,7 @@ impl NvmeDriver { // Pre-create the IO queue 1 for CPU 0. The other queues will be created // lazily. Numbering for I/O queues starts with 1 (0 is Admin). - let issuer = worker + let (issuer, fallback_alloc) = worker .create_io_queue(&mut state, 0) .await .context("failed to create io queue 1")?; @@ -807,7 +812,7 @@ impl DriverWorkerTask { return; } - let issuer = match self + let (issuer, fallback_alloc) = match self .create_io_queue(state, cpu) .instrument(info_span!("create_nvme_io_queue", cpu)) .await @@ -828,7 +833,7 @@ impl DriverWorkerTask { error = err.as_ref() as &dyn std::error::Error, "failed to create io queue, falling back" ); - fallback.clone() + (fallback.clone(), false) } }; @@ -842,7 +847,7 @@ impl DriverWorkerTask { &mut self, state: &mut WorkerState, cpu: u32, - ) -> anyhow::Result { + ) -> anyhow::Result<(IoIssuer, bool)> { if self.io.len() >= state.max_io_queues as usize { anyhow::bail!("no more io queues available"); } @@ -868,6 +873,7 @@ impl DriverWorkerTask { self.registers.clone(), ) .with_context(|| format!("failed to create io queue pair {qid}"))?; + let fallback_used = queue.fallback_used(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); @@ -939,10 +945,12 @@ impl DriverWorkerTask { return Err(err); } - Ok(IoIssuer { + Ok((IoIssuer { issuer: io_queue.queue.issuer().clone(), cpu, - }) + }, + fallback_used, + )) } /// Save NVMe driver state for servicing. diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index 95ee213b1a..b52a6dc3b0 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -45,6 +45,13 @@ use zerocopy::FromZeros; /// Value for unused PRP entries, to catch/mitigate buffer size mismatches. const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1); +/// Memory allocator errors. 
+#[derive(Debug, Error)] +pub enum AllocatorError { + #[error("no fallback allocator")] + NoFallbackAllocator, +} + pub(crate) struct QueuePair { task: Task, cancel: Cancel, @@ -53,6 +60,7 @@ pub(crate) struct QueuePair { qid: u16, sq_entries: u16, cq_entries: u16, + fallback_alloc_used: bool, } impl Inspect for QueuePair { @@ -65,6 +73,7 @@ impl Inspect for QueuePair { qid: _, sq_entries: _, cq_entries: _, + fallback_alloc_used: _, } = self; issuer.send.send(Req::Inspect(req.defer())); } @@ -181,18 +190,29 @@ impl QueuePair { interrupt: DeviceInterrupt, registers: Arc>, ) -> anyhow::Result { + assert!(sq_entries <= Self::MAX_SQ_ENTRIES); + assert!(cq_entries <= Self::MAX_CQ_ENTRIES); + + let mut fallback_involved = false; let total_size = QueuePair::SQ_SIZE + QueuePair::CQ_SIZE + QueuePair::PER_QUEUE_PAGES * PAGE_SIZE; let dma_client = device.dma_client(); let mem = dma_client .allocate_dma_buffer(total_size) - .context("failed to allocate memory for queues")?; - - assert!(sq_entries <= Self::MAX_SQ_ENTRIES); - assert!(cq_entries <= Self::MAX_CQ_ENTRIES); + .context("failed to allocate memory for the queues") + .or_else(|_| { + fallback_involved = true; + device.fallback_dma_client().map_or( + Err(AllocatorError::NoFallbackAllocator.into()), + |f| { + f.allocate_dma_buffer(total_size) + .context("fallback allocator also failed") + }, + ) + }); QueuePair::new_or_restore( - spawner, qid, sq_entries, cq_entries, interrupt, registers, mem, None, + spawner, qid, sq_entries, cq_entries, interrupt, registers, mem?, None, fallback_involved, ) } @@ -206,6 +226,7 @@ impl QueuePair { registers: Arc>, mem: MemoryBlock, saved_state: Option<&QueueHandlerSavedState>, + fallback_alloc_used: bool, ) -> anyhow::Result { // MemoryBlock is either allocated or restored prior calling here. let sq_mem_block = mem.subblock(0, QueuePair::SQ_SIZE); @@ -255,6 +276,7 @@ impl QueuePair { qid, sq_entries, cq_entries, + fallback_alloc_used, }) } @@ -275,6 +297,11 @@ impl QueuePair { self.task.await } + /// Indicates that fallback allocator was used for this queue pair. + pub fn fallback_used(&self) -> bool { + self.fallback_alloc_used + } + /// Save queue pair state for servicing. pub async fn save(&self) -> anyhow::Result { // Return error if the queue does not have any memory allocated. @@ -321,6 +348,7 @@ impl QueuePair { registers, mem, Some(handler_data), + false, ) } } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index afaef105d7..16b827c46f 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -366,6 +366,11 @@ impl Dev self.device.dma_client() } + // TODO: We should add one for testing. + fn fallback_dma_client(&self) -> Option> { + None + } + fn max_interrupt_count(&self) -> u32 { self.device.max_interrupt_count() } diff --git a/vm/devices/user_driver/src/lib.rs b/vm/devices/user_driver/src/lib.rs index 2a451bc055..1461c382e8 100644 --- a/vm/devices/user_driver/src/lib.rs +++ b/vm/devices/user_driver/src/lib.rs @@ -33,6 +33,9 @@ pub trait DeviceBacking: 'static + Send + Inspect { /// DMA Client for the device. fn dma_client(&self) -> Arc; + /// Fallback DMA Client for the device. + fn fallback_dma_client(&self) -> Option>; + /// Returns the maximum number of interrupts that can be mapped. 
fn max_interrupt_count(&self) -> u32; diff --git a/vm/devices/user_driver/src/vfio.rs b/vm/devices/user_driver/src/vfio.rs index 0bc6a7670b..137d1db334 100644 --- a/vm/devices/user_driver/src/vfio.rs +++ b/vm/devices/user_driver/src/vfio.rs @@ -57,6 +57,7 @@ pub struct VfioDevice { #[inspect(skip)] config_space: vfio_sys::RegionInfo, dma_client: Arc, + fallback_dma_client: Option>, } #[derive(Inspect)] @@ -74,8 +75,16 @@ impl VfioDevice { driver_source: &VmTaskDriverSource, pci_id: &str, dma_client: Arc, + fallback_dma_client: Option>, ) -> anyhow::Result { - Self::restore(driver_source, pci_id, false, dma_client).await + Self::restore( + driver_source, + pci_id, + false, + dma_client, + fallback_dma_client, + ) + .await } /// Creates a new VFIO-backed device for the PCI device with `pci_id`. @@ -83,8 +92,9 @@ impl VfioDevice { pub async fn restore( driver_source: &VmTaskDriverSource, pci_id: &str, - keepalive: bool, + vf_keepalive: bool, dma_client: Arc, + fallback_dma_client: Option>, ) -> anyhow::Result { let path = Path::new("/sys/bus/pci/devices").join(pci_id); @@ -110,7 +120,7 @@ impl VfioDevice { } container.set_iommu(IommuType::NoIommu)?; - if keepalive { + if vf_keepalive { // Prevent physical hardware interaction when restoring. group.set_keep_alive(pci_id)?; } @@ -131,6 +141,7 @@ impl VfioDevice { driver_source: driver_source.clone(), interrupts: Vec::new(), dma_client, + fallback_dma_client, }; // Ensure bus master enable and memory space enable are set, and that @@ -235,6 +246,10 @@ impl DeviceBacking for VfioDevice { self.dma_client.clone() } + fn fallback_dma_client(&self) -> Option> { + self.fallback_dma_client.clone() + } + fn max_interrupt_count(&self) -> u32 { self.msix_info.count } diff --git a/vm/devices/user_driver_emulated_mock/src/lib.rs b/vm/devices/user_driver_emulated_mock/src/lib.rs index fb68810cd5..0ed650d957 100644 --- a/vm/devices/user_driver_emulated_mock/src/lib.rs +++ b/vm/devices/user_driver_emulated_mock/src/lib.rs @@ -162,6 +162,11 @@ impl Option> { + None + } + fn max_interrupt_count(&self) -> u32 { self.controller.events.len() as u32 } From 0f1227523153a5b99dd1071c6d75a0435fb1c909 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 24 Apr 2025 00:19:44 +0000 Subject: [PATCH 12/18] Simplify some return values --- openhcl/underhill_core/src/nvme_manager.rs | 4 ++- .../disk_nvme/nvme_driver/src/driver.rs | 30 +++++++++++-------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 6bea0f422f..47e09205e1 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -277,7 +277,8 @@ impl NvmeManagerWorker { // Prevent devices from originating controller reset in drop(). dev.update_servicing_flags(do_not_reset); } - break (span, nvme_keepalive); + // Use final combined flag to report back. + break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { rpc.complete(self.save_restore_supported); @@ -289,6 +290,7 @@ impl NvmeManagerWorker { // because the Shutdown request is never sent. // // Tear down all the devices if nvme_keepalive is not set. + // TODO: Since the loop above is returning combined flag, this condition can be simplified. 
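            // (Devices are left alive across servicing only when both are true:
            //  keepalive was requested for this servicing operation and the worker
            //  still reports save/restore as supported.)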
if !nvme_keepalive || !self.save_restore_supported { async { join_all(self.devices.drain().map(|(pci_id, driver)| { diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 2dfc57e508..3e4d344f3e 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -92,6 +92,8 @@ struct WorkerState { qsize: u16, #[inspect(skip)] async_event_task: Task<()>, + /// Tracks if fallback memory allocator was ever used. + fallback_used: bool, } /// An error restoring from saved state. @@ -252,7 +254,7 @@ impl NvmeDriver { io_issuers, rescan_event: Default::default(), namespaces: vec![], - nvme_keepalive: false, + nvme_keepalive: false, // In the beginning always assume it's not supported. }) } @@ -288,8 +290,9 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; - if admin.fallback_used() { - // Unconditionally disable keepalive if fallback allocator was involved. + let fallback_used = admin.fallback_used(); + if fallback_used { + // Unconditionally override keepalive if fallback allocator was involved. self.nvme_keepalive = false; tracing::info!("YSP: nvme_keepalive explicitly disabled"); } @@ -432,13 +435,14 @@ impl NvmeDriver { qsize, async_event_task, max_io_queues, + fallback_used, // May be updated in create_io_queue() below. }; self.admin = Some(admin.issuer().clone()); // Pre-create the IO queue 1 for CPU 0. The other queues will be created // lazily. Numbering for I/O queues starts with 1 (0 is Admin). - let (issuer, fallback_alloc) = worker + let issuer = worker .create_io_queue(&mut state, 0) .await .context("failed to create io queue 1")?; @@ -586,7 +590,7 @@ impl NvmeDriver { io_issuers, rescan_event: Default::default(), namespaces: vec![], - nvme_keepalive: true, + nvme_keepalive: true, // We know it is supported because we're in restore(). }; let task = &mut this.task.as_mut().unwrap(); @@ -640,6 +644,7 @@ impl NvmeDriver { qsize: saved_state.worker_data.qsize, async_event_task, max_io_queues: saved_state.worker_data.max_io_queues, + fallback_used: false, // Because this is restore() which implies that everything was saved. }; this.admin = Some(admin.issuer().clone()); @@ -812,7 +817,7 @@ impl DriverWorkerTask { return; } - let (issuer, fallback_alloc) = match self + let issuer = match self .create_io_queue(state, cpu) .instrument(info_span!("create_nvme_io_queue", cpu)) .await @@ -833,7 +838,7 @@ impl DriverWorkerTask { error = err.as_ref() as &dyn std::error::Error, "failed to create io queue, falling back" ); - (fallback.clone(), false) + fallback.clone() } }; @@ -847,7 +852,7 @@ impl DriverWorkerTask { &mut self, state: &mut WorkerState, cpu: u32, - ) -> anyhow::Result<(IoIssuer, bool)> { + ) -> anyhow::Result { if self.io.len() >= state.max_io_queues as usize { anyhow::bail!("no more io queues available"); } @@ -873,7 +878,8 @@ impl DriverWorkerTask { self.registers.clone(), ) .with_context(|| format!("failed to create io queue pair {qid}"))?; - let fallback_used = queue.fallback_used(); + // Keep tracking if fallback allocator was ever used. + state.fallback_used |= queue.fallback_used(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); @@ -945,12 +951,10 @@ impl DriverWorkerTask { return Err(err); } - Ok((IoIssuer { + Ok(IoIssuer { issuer: io_queue.queue.issuer().clone(), cpu, - }, - fallback_used, - )) + }) } /// Save NVMe driver state for servicing. 
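The accumulation added above is deliberately one-way: once any IO queue pair reports a fallback allocation, state.fallback_used stays set for the life of the driver, and the next commit adds a query so the manager can read it. A tiny sketch of that sticky-OR behavior (standalone, illustrative values):

    fn main() {
        // e.g. the third IO queue could not get persistent memory and fell back
        let per_queue_fallback = [false, false, true, false];
        let mut fallback_ever_used = false;
        for used in per_queue_fallback {
            fallback_ever_used |= used; // mirrors `state.fallback_used |= queue.fallback_used()`
        }
        assert!(fallback_ever_used);
    }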
From 0d796baf51884d1a0ca4ab19b455d60c4c28e311 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 24 Apr 2025 01:17:34 +0000 Subject: [PATCH 13/18] Closer to final code --- openhcl/underhill_core/src/dispatch/mod.rs | 1 - openhcl/underhill_core/src/nvme_manager.rs | 10 ++++++++++ .../disk_nvme/nvme_driver/src/driver.rs | 20 +++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 04db01ad34..7cf443ff32 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -7,7 +7,6 @@ mod pci_shutdown; pub mod vtl2_settings_worker; use self::vtl2_settings_worker::DeviceInterfaces; -use crate::nvme_manager; use crate::ControlRequest; use crate::emuplat::EmuplatServicing; use crate::emuplat::netvsp::RuntimeSavedState; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 47e09205e1..6f61f8bc4d 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -281,6 +281,16 @@ impl NvmeManagerWorker { break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { + let mut fallback_used = false; + for (_s, dev) in self.devices.iter_mut() { + // Prevent devices from originating controller reset in drop(). + fallback_used |= dev.query_fallback_used().await; + } + if fallback_used { + // If any of the attached devices ever used fallback allocator, + // update internal tracking as well, and return the result. + self.save_restore_supported = false; + } rpc.complete(self.save_restore_supported); } } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 3e4d344f3e..023631657f 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -163,6 +163,8 @@ enum NvmeWorkerRequest { CreateIssuer(Rpc), /// Save worker state. Save(Rpc<(), anyhow::Result>), + /// Query worker state. + QueryAllocatorFallback(Rpc<(), bool>), } impl NvmeDriver { @@ -703,6 +705,21 @@ impl NvmeDriver { pub fn update_servicing_flags(&mut self, nvme_keepalive: bool) { self.nvme_keepalive = nvme_keepalive; } + + /// Queries worker task if memory allocator ever fell back. 
+ pub async fn query_fallback_used(&self) -> bool { + let fb = self + .io_issuers + .send + .call(NvmeWorkerRequest::QueryAllocatorFallback, ()) + .await; + match fb { + Ok(fallback_used) => fallback_used, + Err(_) => { + false + } + } + } } async fn handle_asynchronous_events( @@ -802,6 +819,9 @@ impl AsyncRun for DriverWorkerTask { Some(NvmeWorkerRequest::Save(rpc)) => { rpc.handle(async |_| self.save(state).await).await } + Some(NvmeWorkerRequest::QueryAllocatorFallback(rpc)) => { + rpc.complete(state.fallback_used) + } None => break, } } From b5a2ba1ddaf4f7e203de5fb35d84eb74740a1e3d Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Thu, 24 Apr 2025 18:36:48 +0000 Subject: [PATCH 14/18] Validated fallback allocator for NVMe, more tracing --- openhcl/underhill_core/src/dispatch/mod.rs | 11 ++--- openhcl/underhill_core/src/nvme_manager.rs | 40 ++++++++++++++----- .../disk_nvme/nvme_driver/src/driver.rs | 30 ++++++-------- .../disk_nvme/nvme_driver/src/queue_pair.rs | 32 +++++++++------ 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 7cf443ff32..6c49dae1a2 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -494,13 +494,14 @@ impl LoadedVm { // NOTE: This is set via the corresponding env arg, as this feature is // experimental. - let nvme_keepalive_runtime_enabled = if let Some(nvme_manager) = self.nvme_manager.as_ref() { - nvme_manager.is_keepalive_still_enabled().await + let nvme_keepalive_runtime = if let Some(nvme_manager) = self.nvme_manager.as_ref() { + nvme_manager.query_keepalive_runtime_status().await } else { false }; - let nvme_keepalive = self.nvme_keepalive && capabilities_flags.enable_nvme_keepalive() && nvme_keepalive_runtime_enabled; - tracing::info!("YSP: {nvme_keepalive_runtime_enabled} {nvme_keepalive}"); + let nvme_keepalive = self.nvme_keepalive + && capabilities_flags.enable_nvme_keepalive() + && nvme_keepalive_runtime; // Do everything before the log flush under a span. let r = async { @@ -533,7 +534,7 @@ impl LoadedVm { if let Some(nvme_manager) = self.nvme_manager.take() { nvme_manager .shutdown(nvme_keepalive) - .instrument(tracing::info_span!("shutdown_nvme_vfio", %correlation_id, %nvme_keepalive)) + .instrument(tracing::info_span!("shutdown_nvme_vfio", %correlation_id, %nvme_keepalive, %nvme_keepalive_runtime)) .await; } }; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 6f61f8bc4d..4144b66a7c 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -129,9 +129,18 @@ impl NvmeManager { } /// Save could have been disabled if fallback allocator was used. - pub async fn is_keepalive_still_enabled(&self) -> bool { - let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await { - Ok(s) => s, + pub async fn query_keepalive_runtime_status(&self) -> bool { + let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await + { + Ok(s) => { + if s.fallback_alloc > 0 { + tracing::warn!( + mem_size = s.fallback_alloc, + "fallback mem allocator was used" + ); + } + s.nvme_keepalive + } Err(_) => false, }; self.save_restore_supported && worker_save_restore @@ -177,6 +186,13 @@ impl NvmeManager { } } +pub struct NvmeKeepaliveRuntimeStatus { + /// Indicates if keepalive still enabled. 
+ pub nvme_keepalive: bool, + /// How much memory (bytes) was allocated by fallback allocator. + pub fallback_alloc: u64, +} + enum Request { Inspect(inspect::Deferred), ForceLoadDriver(inspect::DeferredUpdate), @@ -186,7 +202,7 @@ enum Request { span: tracing::Span, nvme_keepalive: bool, }, - KeepAliveStatus(Rpc<(), bool>), + KeepAliveStatus(Rpc<(), NvmeKeepaliveRuntimeStatus>), } #[derive(Debug, Clone)] @@ -281,17 +297,19 @@ impl NvmeManagerWorker { break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { - let mut fallback_used = false; + let mut fallback_alloc = 0u64; for (_s, dev) in self.devices.iter_mut() { - // Prevent devices from originating controller reset in drop(). - fallback_used |= dev.query_fallback_used().await; + fallback_alloc += dev.query_fallback_alloc().await; } - if fallback_used { + if fallback_alloc > 0 { // If any of the attached devices ever used fallback allocator, - // update internal tracking as well, and return the result. + // update internal tracking and return the result. self.save_restore_supported = false; } - rpc.complete(self.save_restore_supported); + rpc.complete(NvmeKeepaliveRuntimeStatus { + nvme_keepalive: self.save_restore_supported, + fallback_alloc, + }); } } }; @@ -334,7 +352,7 @@ impl NvmeManagerWorker { // Create a fallback allocator which uses heap. // When fallback allocator is involved, nvme_keepalive // will be implicitly disabled. - let fallback_dma_client = if self.save_restore_supported { + let fallback_dma_client = if self.save_restore_supported && !self.is_isolated { let client: Arc = self .dma_client_spawner .new_client(DmaClientParameters { diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index 023631657f..de94ccc423 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -92,8 +92,8 @@ struct WorkerState { qsize: u16, #[inspect(skip)] async_event_task: Task<()>, - /// Tracks if fallback memory allocator was ever used. - fallback_used: bool, + /// Tracks how much memory was allocated using fallback allocator. + fallback_alloc: u64, } /// An error restoring from saved state. @@ -163,8 +163,8 @@ enum NvmeWorkerRequest { CreateIssuer(Rpc), /// Save worker state. Save(Rpc<(), anyhow::Result>), - /// Query worker state. - QueryAllocatorFallback(Rpc<(), bool>), + /// Query how much memory was allocated with fallback allocator. + QueryAllocatorFallback(Rpc<(), u64>), } impl NvmeDriver { @@ -292,11 +292,10 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; - let fallback_used = admin.fallback_used(); - if fallback_used { + let fallback_alloc = admin.fallback_alloc(); + if fallback_alloc > 0 { // Unconditionally override keepalive if fallback allocator was involved. self.nvme_keepalive = false; - tracing::info!("YSP: nvme_keepalive explicitly disabled"); } let admin = worker.admin.insert(admin); @@ -437,7 +436,7 @@ impl NvmeDriver { qsize, async_event_task, max_io_queues, - fallback_used, // May be updated in create_io_queue() below. + fallback_alloc, // May be updated in create_io_queue() below. }; self.admin = Some(admin.issuer().clone()); @@ -646,7 +645,7 @@ impl NvmeDriver { qsize: saved_state.worker_data.qsize, async_event_task, max_io_queues: saved_state.worker_data.max_io_queues, - fallback_used: false, // Because this is restore() which implies that everything was saved. + fallback_alloc: 0, // This is restore() which implies that everything was saved. 
}; this.admin = Some(admin.issuer().clone()); @@ -707,18 +706,13 @@ impl NvmeDriver { } /// Queries worker task if memory allocator ever fell back. - pub async fn query_fallback_used(&self) -> bool { + pub async fn query_fallback_alloc(&self) -> u64 { let fb = self .io_issuers .send .call(NvmeWorkerRequest::QueryAllocatorFallback, ()) .await; - match fb { - Ok(fallback_used) => fallback_used, - Err(_) => { - false - } - } + fb.unwrap_or_default() } } @@ -820,7 +814,7 @@ impl AsyncRun for DriverWorkerTask { rpc.handle(async |_| self.save(state).await).await } Some(NvmeWorkerRequest::QueryAllocatorFallback(rpc)) => { - rpc.complete(state.fallback_used) + rpc.complete(state.fallback_alloc) } None => break, } @@ -899,7 +893,7 @@ impl DriverWorkerTask { ) .with_context(|| format!("failed to create io queue pair {qid}"))?; // Keep tracking if fallback allocator was ever used. - state.fallback_used |= queue.fallback_used(); + state.fallback_alloc += queue.fallback_alloc(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index b52a6dc3b0..4f283747ef 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -48,7 +48,7 @@ const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1); /// Memory allocator errors. #[derive(Debug, Error)] pub enum AllocatorError { - #[error("no fallback allocator")] + #[error("no fallback mem allocator")] NoFallbackAllocator, } @@ -60,7 +60,7 @@ pub(crate) struct QueuePair { qid: u16, sq_entries: u16, cq_entries: u16, - fallback_alloc_used: bool, + fallback_alloc: u64, } impl Inspect for QueuePair { @@ -73,7 +73,7 @@ impl Inspect for QueuePair { qid: _, sq_entries: _, cq_entries: _, - fallback_alloc_used: _, + fallback_alloc: _, } = self; issuer.send.send(Req::Inspect(req.defer())); } @@ -193,7 +193,7 @@ impl QueuePair { assert!(sq_entries <= Self::MAX_SQ_ENTRIES); assert!(cq_entries <= Self::MAX_CQ_ENTRIES); - let mut fallback_involved = false; + let mut fallback_alloc = 0u64; let total_size = QueuePair::SQ_SIZE + QueuePair::CQ_SIZE + QueuePair::PER_QUEUE_PAGES * PAGE_SIZE; let dma_client = device.dma_client(); @@ -201,18 +201,26 @@ impl QueuePair { .allocate_dma_buffer(total_size) .context("failed to allocate memory for the queues") .or_else(|_| { - fallback_involved = true; + fallback_alloc += total_size as u64; device.fallback_dma_client().map_or( Err(AllocatorError::NoFallbackAllocator.into()), |f| { f.allocate_dma_buffer(total_size) - .context("fallback allocator also failed") + .context("fallback mem allocator also failed") }, ) }); QueuePair::new_or_restore( - spawner, qid, sq_entries, cq_entries, interrupt, registers, mem?, None, fallback_involved, + spawner, + qid, + sq_entries, + cq_entries, + interrupt, + registers, + mem?, + None, + fallback_alloc, ) } @@ -226,7 +234,7 @@ impl QueuePair { registers: Arc>, mem: MemoryBlock, saved_state: Option<&QueueHandlerSavedState>, - fallback_alloc_used: bool, + fallback_alloc: u64, ) -> anyhow::Result { // MemoryBlock is either allocated or restored prior calling here. let sq_mem_block = mem.subblock(0, QueuePair::SQ_SIZE); @@ -276,7 +284,7 @@ impl QueuePair { qid, sq_entries, cq_entries, - fallback_alloc_used, + fallback_alloc, }) } @@ -298,8 +306,8 @@ impl QueuePair { } /// Indicates that fallback allocator was used for this queue pair. 
- pub fn fallback_used(&self) -> bool { - self.fallback_alloc_used + pub fn fallback_alloc(&self) -> u64 { + self.fallback_alloc } /// Save queue pair state for servicing. @@ -348,7 +356,7 @@ impl QueuePair { registers, mem, Some(handler_data), - false, + 0u64, ) } } From e4e7ffc519d5807e12d9b7138d6c60a3cfbce828 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 25 Apr 2025 02:58:35 +0000 Subject: [PATCH 15/18] Move fallback logic to dma clients --- Cargo.lock | 2 + .../lower_vtl_permissions_guard/src/lib.rs | 15 ++ openhcl/openhcl_dma_manager/Cargo.toml | 2 + openhcl/openhcl_dma_manager/src/lib.rs | 95 +++++++++++- openhcl/underhill_core/src/dispatch/mod.rs | 7 + openhcl/underhill_core/src/emuplat/netvsp.rs | 2 +- openhcl/underhill_core/src/nvme_manager.rs | 136 ++++++++++-------- openhcl/underhill_core/src/worker.rs | 87 ++++++----- .../nvme_driver/fuzz/fuzz_emulated_device.rs | 5 - .../disk_nvme/nvme_driver/src/driver.rs | 30 ++-- .../disk_nvme/nvme_driver/src/queue_pair.rs | 40 +----- .../disk_nvme/nvme_driver/src/tests.rs | 5 - vm/devices/user_driver/src/lib.rs | 20 ++- vm/devices/user_driver/src/lockmem.rs | 15 ++ vm/devices/user_driver/src/vfio.rs | 17 +-- .../user_driver_emulated_mock/src/lib.rs | 5 - vm/page_pool_alloc/src/lib.rs | 20 ++- 17 files changed, 313 insertions(+), 190 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 13cbe2d9d2..ab0f6572ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4805,6 +4805,8 @@ dependencies = [ "memory_range", "mesh", "page_pool_alloc", + "parking_lot", + "thiserror 2.0.12", "user_driver", "virt", "vmcore", diff --git a/openhcl/lower_vtl_permissions_guard/src/lib.rs b/openhcl/lower_vtl_permissions_guard/src/lib.rs index a8678fbc71..1b2c08f048 100644 --- a/openhcl/lower_vtl_permissions_guard/src/lib.rs +++ b/openhcl/lower_vtl_permissions_guard/src/lib.rs @@ -108,4 +108,19 @@ impl DmaClient for LowerVtlMemorySpawner { fn attach_pending_buffers(&self) -> Result> { anyhow::bail!("restore is not supported for LowerVtlMemorySpawner") } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + false + } + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64 { + 0 + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + 0 + } } diff --git a/openhcl/openhcl_dma_manager/Cargo.toml b/openhcl/openhcl_dma_manager/Cargo.toml index 97b9821055..6e61581bf5 100644 --- a/openhcl/openhcl_dma_manager/Cargo.toml +++ b/openhcl/openhcl_dma_manager/Cargo.toml @@ -16,6 +16,8 @@ inspect.workspace = true memory_range.workspace = true mesh.workspace = true page_pool_alloc.workspace = true +parking_lot.workspace = true +thiserror.workspace = true user_driver.workspace = true virt.workspace = true vmcore.workspace = true diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 64db4d2b0a..819817edc6 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -16,10 +16,21 @@ use memory_range::MemoryRange; use page_pool_alloc::PagePool; use page_pool_alloc::PagePoolAllocator; use page_pool_alloc::PagePoolAllocatorSpawner; +use parking_lot::Mutex; use std::sync::Arc; +use thiserror::Error; use user_driver::DmaClient; +use user_driver::DmaClientAllocStats; use user_driver::lockmem::LockedMemorySpawner; +/// DMA manager errors. +#[derive(Debug, Error)] +pub enum DmaManagerError { + /// No memory. 
+ #[error("no memory")] + NoMemory, +} + /// Save restore support for [`OpenhclDmaManager`]. pub mod save_restore { use super::OpenhclDmaManager; @@ -189,7 +200,11 @@ impl virt::VtlMemoryProtection for DmaManagerLowerVtl { } impl DmaManagerInner { - fn new_dma_client(&self, params: DmaClientParameters) -> anyhow::Result> { + fn new_dma_client( + &self, + params: DmaClientParameters, + fallback: Option>, + ) -> anyhow::Result> { // Allocate the inner client that actually performs the allocations. let backing = { let DmaClientParameters { @@ -297,7 +312,15 @@ impl DmaManagerInner { } }; - Ok(Arc::new(OpenhclDmaClient { backing, params })) + Ok(Arc::new(OpenhclDmaClient { + backing, + params, + fallback, + inner_stats: Mutex::new(DmaClientAllocStats { + total_alloc: 0, + fallback_alloc: 0, + }), + })) } } @@ -346,8 +369,17 @@ impl OpenhclDmaManager { /// Creates a new DMA client with the given device name and lower VTL /// policy. - pub fn new_client(&self, params: DmaClientParameters) -> anyhow::Result> { - self.inner.new_dma_client(params) + pub fn new_client( + &self, + params: DmaClientParameters, + fallback_params: Option, + ) -> anyhow::Result> { + let fb = if let Some(fb1) = fallback_params { + self.inner.new_dma_client(fb1, None).ok() + } else { + None + }; + self.inner.new_dma_client(params, fb) } /// Returns a [`DmaClientSpawner`] for creating DMA clients. @@ -385,8 +417,17 @@ pub struct DmaClientSpawner { impl DmaClientSpawner { /// Creates a new DMA client with the given parameters. - pub fn new_client(&self, params: DmaClientParameters) -> anyhow::Result> { - self.inner.new_dma_client(params) + pub fn new_client( + &self, + params: DmaClientParameters, + fallback_params: Option, + ) -> anyhow::Result> { + let fb = if let Some(fb1) = fallback_params { + self.inner.new_dma_client(fb1, None).ok() + } else { + None + }; + self.inner.new_dma_client(params, fb) } } @@ -429,6 +470,16 @@ impl DmaClientBacking { DmaClientBacking::LockedMemoryLowerVtl(spawner) => spawner.attach_pending_buffers(), } } + + fn is_persistent(&self) -> bool { + match self { + DmaClientBacking::SharedPool(_allocator) => false, + DmaClientBacking::PrivatePool(_allocator) => true, + DmaClientBacking::LockedMemory(_spawner) => false, + DmaClientBacking::PrivatePoolLowerVtl(_spawner) => false, + DmaClientBacking::LockedMemoryLowerVtl(_spawner) => false, + } + } } /// An OpenHCL dma client. This client implements inspect to allow seeing what @@ -437,6 +488,10 @@ impl DmaClientBacking { pub struct OpenhclDmaClient { backing: DmaClientBacking, params: DmaClientParameters, + #[inspect(skip)] // TODO: Skip for now + /// Allocation statistics per client. + inner_stats: Mutex, + fallback: Option>, } impl DmaClient for OpenhclDmaClient { @@ -444,10 +499,36 @@ impl DmaClient for OpenhclDmaClient { &self, total_size: usize, ) -> anyhow::Result { - self.backing.allocate_dma_buffer(total_size) + let mut stats = self.inner_stats.lock(); + stats.total_alloc += total_size as u64; + let mem_block = self.backing.allocate_dma_buffer(total_size).or_else(|_| { + stats.fallback_alloc += total_size as u64; + self.fallback + .as_ref() + .map_or(Err(DmaManagerError::NoMemory.into()), |f| { + f.allocate_dma_buffer(total_size) + }) + }); + + mem_block } fn attach_pending_buffers(&self) -> anyhow::Result> { self.backing.attach_pending_buffers() } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + self.backing.is_persistent() + } + + /// How much memory was allocated during session. 
+ fn alloc_size(&self) -> u64 { + self.inner_stats.lock().total_alloc + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + self.inner_stats.lock().fallback_alloc + } } diff --git a/openhcl/underhill_core/src/dispatch/mod.rs b/openhcl/underhill_core/src/dispatch/mod.rs index 6c49dae1a2..152637070b 100644 --- a/openhcl/underhill_core/src/dispatch/mod.rs +++ b/openhcl/underhill_core/src/dispatch/mod.rs @@ -499,6 +499,13 @@ impl LoadedVm { } else { false }; + // Three sources to determine if keepalive can be enabled: + // 1. Host indicates that it is compatible with keepalive + // by setting device tree property when VM starts. + // 2. During servicing the capabilities_flags is also set + // so OpenHCL knows that it wasn't migrated to an older host. + // 3. If we ran out of dedicated DMA memory and used non-persistent + // fallback allocator, disable keepalive altogether. let nvme_keepalive = self.nvme_keepalive && capabilities_flags.enable_nvme_keepalive() && nvme_keepalive_runtime; diff --git a/openhcl/underhill_core/src/emuplat/netvsp.rs b/openhcl/underhill_core/src/emuplat/netvsp.rs index 9398e0fe21..bf517208fb 100644 --- a/openhcl/underhill_core/src/emuplat/netvsp.rs +++ b/openhcl/underhill_core/src/emuplat/netvsp.rs @@ -120,7 +120,7 @@ async fn try_create_mana_device( max_sub_channels: u16, dma_client: Arc, ) -> anyhow::Result> { - let device = VfioDevice::new(driver_source, pci_id, dma_client, None) + let device = VfioDevice::new(driver_source, pci_id, dma_client) .await .context("failed to open device")?; diff --git a/openhcl/underhill_core/src/nvme_manager.rs b/openhcl/underhill_core/src/nvme_manager.rs index 4144b66a7c..de8037fef4 100644 --- a/openhcl/underhill_core/src/nvme_manager.rs +++ b/openhcl/underhill_core/src/nvme_manager.rs @@ -30,6 +30,7 @@ use std::sync::Arc; use thiserror::Error; use tracing::Instrument; use user_driver::DmaClient; +use user_driver::DmaClientAllocStats; use user_driver::vfio::VfioDevice; use vm_resource::AsyncResolveResource; use vm_resource::ResourceId; @@ -133,9 +134,9 @@ impl NvmeManager { let worker_save_restore = match self.client.sender.call(Request::KeepAliveStatus, ()).await { Ok(s) => { - if s.fallback_alloc > 0 { + if s.stats.fallback_alloc > 0 { tracing::warn!( - mem_size = s.fallback_alloc, + mem_size = s.stats.fallback_alloc, "fallback mem allocator was used" ); } @@ -187,10 +188,10 @@ impl NvmeManager { } pub struct NvmeKeepaliveRuntimeStatus { - /// Indicates if keepalive still enabled. + /// Indicates if keepalive is still enabled. pub nvme_keepalive: bool, - /// How much memory (bytes) was allocated by fallback allocator. - pub fallback_alloc: u64, + /// Retrieve statistics from connected DMA client. + pub stats: DmaClientAllocStats, } enum Request { @@ -297,18 +298,23 @@ impl NvmeManagerWorker { break (span, do_not_reset); } Request::KeepAliveStatus(rpc) => { - let mut fallback_alloc = 0u64; + let mut stats = DmaClientAllocStats { + total_alloc: 0, + fallback_alloc: 0, + }; for (_s, dev) in self.devices.iter_mut() { - fallback_alloc += dev.query_fallback_alloc().await; + let dev_stats = dev.get_alloc_stats().await; + stats.total_alloc += dev_stats.total_alloc; + stats.fallback_alloc += dev_stats.fallback_alloc; } - if fallback_alloc > 0 { + if stats.fallback_alloc > 0 { // If any of the attached devices ever used fallback allocator, // update internal tracking and return the result. 
self.save_restore_supported = false; } rpc.complete(NvmeKeepaliveRuntimeStatus { nvme_keepalive: self.save_restore_supported, - fallback_alloc, + stats, }); } } @@ -341,51 +347,45 @@ impl NvmeManagerWorker { hash_map::Entry::Occupied(entry) => entry.into_mut(), hash_map::Entry::Vacant(entry) => { let device_name = format!("nvme_{}", pci_id); + let lower_vtl_policy = LowerVtlPermissionPolicy::Any; let allocation_visibility = if self.is_isolated { AllocationVisibility::Shared } else { AllocationVisibility::Private }; - let lower_vtl_policy = LowerVtlPermissionPolicy::Any; + + // Main client parameters. + let main_params = DmaClientParameters { + device_name: device_name.clone(), + lower_vtl_policy: lower_vtl_policy.clone(), + allocation_visibility, + persistent_allocations: self.save_restore_supported, + }; // Persistent allocations use fixed size memory. // Create a fallback allocator which uses heap. // When fallback allocator is involved, nvme_keepalive // will be implicitly disabled. - let fallback_dma_client = if self.save_restore_supported && !self.is_isolated { - let client: Arc = self - .dma_client_spawner - .new_client(DmaClientParameters { - device_name: device_name.clone(), - lower_vtl_policy: lower_vtl_policy.clone(), - allocation_visibility, - persistent_allocations: false, - }) - .map_err(InnerError::DmaClient)?; - Some(client) + let fallback_params = if self.save_restore_supported && !self.is_isolated { + Some(DmaClientParameters { + device_name, + lower_vtl_policy, + allocation_visibility, + persistent_allocations: false, + }) } else { None }; let dma_client = self .dma_client_spawner - .new_client(DmaClientParameters { - device_name, - lower_vtl_policy, - allocation_visibility, - persistent_allocations: self.save_restore_supported, - }) + .new_client(main_params, fallback_params) .map_err(InnerError::DmaClient)?; - let device = VfioDevice::new( - &self.driver_source, - entry.key(), - dma_client, - fallback_dma_client, - ) - .instrument(tracing::info_span!("vfio_device_open", pci_id)) - .await - .map_err(InnerError::Vfio)?; + let device = VfioDevice::new(&self.driver_source, entry.key(), dma_client) + .instrument(tracing::info_span!("vfio_device_open", pci_id)) + .await + .map_err(InnerError::Vfio)?; let driver = nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device) @@ -414,6 +414,45 @@ impl NvmeManagerWorker { .map_err(|source| InnerError::Namespace { nsid, source }) } + /// Copy of the code from get_driver. + fn get_dma_client(&self, pci_id: String) -> Result, InnerError> { + let device_name = format!("nvme_{}", pci_id); + let lower_vtl_policy = LowerVtlPermissionPolicy::Any; + let allocation_visibility = if self.is_isolated { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }; + + // Main client parameters. + let main_params = DmaClientParameters { + device_name: device_name.clone(), + lower_vtl_policy: lower_vtl_policy.clone(), + allocation_visibility, + persistent_allocations: self.save_restore_supported, + }; + + // Persistent allocations use fixed size memory. + // Create a fallback allocator which uses heap. + // When fallback allocator is involved, nvme_keepalive + // will be implicitly disabled. 
+ let fallback_params = if self.save_restore_supported && !self.is_isolated { + Some(DmaClientParameters { + device_name, + lower_vtl_policy, + allocation_visibility, + persistent_allocations: false, + }) + } else { + None + }; + + Ok(self + .dma_client_spawner + .new_client(main_params, fallback_params) + .map_err(InnerError::DmaClient)?) + } + /// Saves NVMe device's states into buffer during servicing. pub async fn save(&mut self) -> anyhow::Result { let mut nvme_disks: Vec = Vec::new(); @@ -438,30 +477,15 @@ impl NvmeManagerWorker { self.devices = HashMap::new(); for disk in &saved_state.nvme_disks { let pci_id = disk.pci_id.clone(); - - let dma_client = self.dma_client_spawner.new_client(DmaClientParameters { - device_name: format!("nvme_{}", pci_id), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: if self.is_isolated { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private - }, - persistent_allocations: true, - })?; + let dma_client = self.get_dma_client(pci_id.clone())?; // This code can wait on each VFIO device until it is arrived. // A potential optimization would be to delay VFIO operation // until it is ready, but a redesign of VfioDevice is needed. - let vfio_device = VfioDevice::restore( - &self.driver_source, - &disk.pci_id.clone(), - true, - dma_client, - None, - ) - .instrument(tracing::info_span!("vfio_device_restore", pci_id)) - .await?; + let vfio_device = + VfioDevice::restore(&self.driver_source, &disk.pci_id.clone(), true, dma_client) + .instrument(tracing::info_span!("vfio_device_restore", pci_id)) + .await?; let nvme_driver = nvme_driver::NvmeDriver::restore( &self.driver_source, diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 3913e8e657..2fb8e487af 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -745,16 +745,19 @@ impl UhVmNetworkSettings { .unwrap_or(MAX_SUBCHANNELS_PER_VNIC) .min(vps_count as u16); - let dma_client = dma_client_spawner.new_client(DmaClientParameters { - device_name: format!("nic_{}", nic_config.pci_id), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: if is_isolated { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private + let dma_client = dma_client_spawner.new_client( + DmaClientParameters { + device_name: format!("nic_{}", nic_config.pci_id), + lower_vtl_policy: LowerVtlPermissionPolicy::Any, + allocation_visibility: if is_isolated { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }, + persistent_allocations: false, }, - persistent_allocations: false, - })?; + None, + )?; let (vf_manager, endpoints, save_state) = HclNetworkVFManager::new( nic_config.instance_id, @@ -1556,16 +1559,19 @@ async fn new_underhill_vm( if !matches!(isolation, virt::IsolationType::Vbs) { get_client.set_gpa_allocator( dma_manager - .new_client(DmaClientParameters { - device_name: "get".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, - allocation_visibility: if isolation.is_isolated() { - AllocationVisibility::Shared - } else { - AllocationVisibility::Private + .new_client( + DmaClientParameters { + device_name: "get".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, + allocation_visibility: if isolation.is_isolated() { + AllocationVisibility::Shared + } else { + AllocationVisibility::Private + }, + persistent_allocations: false, }, - persistent_allocations: false, - }) + None, + ) .context("get dma client")?, ); } @@ -1720,18 +1726,24 
@@ async fn new_underhill_vm( Some(virt_mshv_vtl::CvmLateParams { shared_gm: cvm_mem.shared_gm.clone(), isolated_memory_protector: cvm_mem.protector.clone(), - shared_dma_client: dma_manager.new_client(DmaClientParameters { - device_name: "partition-shared".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: AllocationVisibility::Shared, - persistent_allocations: false, - })?, - private_dma_client: dma_manager.new_client(DmaClientParameters { - device_name: "partition-private".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Any, - allocation_visibility: AllocationVisibility::Private, - persistent_allocations: false, - })?, + shared_dma_client: dma_manager.new_client( + DmaClientParameters { + device_name: "partition-shared".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Any, + allocation_visibility: AllocationVisibility::Shared, + persistent_allocations: false, + }, + None, + )?, + private_dma_client: dma_manager.new_client( + DmaClientParameters { + device_name: "partition-private".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Any, + allocation_visibility: AllocationVisibility::Private, + persistent_allocations: false, + }, + None, + )?, }) } else { None @@ -2860,12 +2872,15 @@ async fn new_underhill_vm( let shutdown_guest = SimpleVmbusClientDeviceWrapper::new( driver_source.simple(), dma_manager - .new_client(DmaClientParameters { - device_name: "shutdown-relay".into(), - lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, - allocation_visibility: AllocationVisibility::Private, - persistent_allocations: false, - }) + .new_client( + DmaClientParameters { + device_name: "shutdown-relay".into(), + lower_vtl_policy: LowerVtlPermissionPolicy::Vtl0, + allocation_visibility: AllocationVisibility::Private, + persistent_allocations: false, + }, + None, + ) .context("shutdown relay dma client")?, shutdown_guest, )?; diff --git a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs index 0e8e5c2b29..1463767ebb 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/fuzz/fuzz_emulated_device.rs @@ -53,11 +53,6 @@ impl Dev self.device.dma_client() } - // Not implemented for Fuzzer yet. - fn fallback_dma_client(&self) -> Option> { - None - } - /// Arbitrarily decide to passthrough or return arbitrary value. fn max_interrupt_count(&self) -> u32 { // Case: Fuzz response diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs index de94ccc423..66bbe00994 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/driver.rs @@ -34,6 +34,7 @@ use thiserror::Error; use tracing::Instrument; use tracing::info_span; use user_driver::DeviceBacking; +use user_driver::DmaClientAllocStats; use user_driver::backoff::Backoff; use user_driver::interrupt::DeviceInterrupt; use user_driver::memory::MemoryBlock; @@ -92,8 +93,6 @@ struct WorkerState { qsize: u16, #[inspect(skip)] async_event_task: Task<()>, - /// Tracks how much memory was allocated using fallback allocator. - fallback_alloc: u64, } /// An error restoring from saved state. @@ -164,7 +163,7 @@ enum NvmeWorkerRequest { /// Save worker state. Save(Rpc<(), anyhow::Result>), /// Query how much memory was allocated with fallback allocator. 
- QueryAllocatorFallback(Rpc<(), u64>), + QueryAllocatorStats(Rpc<(), DmaClientAllocStats>), } impl NvmeDriver { @@ -292,11 +291,6 @@ impl NvmeDriver { ) .context("failed to create admin queue pair")?; - let fallback_alloc = admin.fallback_alloc(); - if fallback_alloc > 0 { - // Unconditionally override keepalive if fallback allocator was involved. - self.nvme_keepalive = false; - } let admin = worker.admin.insert(admin); // Register the admin queue with the controller. @@ -436,7 +430,6 @@ impl NvmeDriver { qsize, async_event_task, max_io_queues, - fallback_alloc, // May be updated in create_io_queue() below. }; self.admin = Some(admin.issuer().clone()); @@ -645,7 +638,6 @@ impl NvmeDriver { qsize: saved_state.worker_data.qsize, async_event_task, max_io_queues: saved_state.worker_data.max_io_queues, - fallback_alloc: 0, // This is restore() which implies that everything was saved. }; this.admin = Some(admin.issuer().clone()); @@ -706,13 +698,16 @@ impl NvmeDriver { } /// Queries worker task if memory allocator ever fell back. - pub async fn query_fallback_alloc(&self) -> u64 { + pub async fn get_alloc_stats(&self) -> DmaClientAllocStats { let fb = self .io_issuers .send - .call(NvmeWorkerRequest::QueryAllocatorFallback, ()) + .call(NvmeWorkerRequest::QueryAllocatorStats, ()) .await; - fb.unwrap_or_default() + fb.unwrap_or(DmaClientAllocStats { + total_alloc: 0, + fallback_alloc: 0, + }) } } @@ -813,8 +808,11 @@ impl AsyncRun for DriverWorkerTask { Some(NvmeWorkerRequest::Save(rpc)) => { rpc.handle(async |_| self.save(state).await).await } - Some(NvmeWorkerRequest::QueryAllocatorFallback(rpc)) => { - rpc.complete(state.fallback_alloc) + Some(NvmeWorkerRequest::QueryAllocatorStats(rpc)) => { + rpc.complete(DmaClientAllocStats { + total_alloc: self.device.dma_client().alloc_size(), + fallback_alloc: self.device.dma_client().fallback_alloc_size(), + }) } None => break, } @@ -892,8 +890,6 @@ impl DriverWorkerTask { self.registers.clone(), ) .with_context(|| format!("failed to create io queue pair {qid}"))?; - // Keep tracking if fallback allocator was ever used. - state.fallback_alloc += queue.fallback_alloc(); let io_sq_addr = queue.sq_addr(); let io_cq_addr = queue.cq_addr(); diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs index 4f283747ef..8b11e06a57 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/queue_pair.rs @@ -45,13 +45,6 @@ use zerocopy::FromZeros; /// Value for unused PRP entries, to catch/mitigate buffer size mismatches. const INVALID_PAGE_ADDR: u64 = !(PAGE_SIZE as u64 - 1); -/// Memory allocator errors. 
-#[derive(Debug, Error)] -pub enum AllocatorError { - #[error("no fallback mem allocator")] - NoFallbackAllocator, -} - pub(crate) struct QueuePair { task: Task, cancel: Cancel, @@ -60,7 +53,6 @@ pub(crate) struct QueuePair { qid: u16, sq_entries: u16, cq_entries: u16, - fallback_alloc: u64, } impl Inspect for QueuePair { @@ -73,7 +65,6 @@ impl Inspect for QueuePair { qid: _, sq_entries: _, cq_entries: _, - fallback_alloc: _, } = self; issuer.send.send(Req::Inspect(req.defer())); } @@ -193,34 +184,15 @@ impl QueuePair { assert!(sq_entries <= Self::MAX_SQ_ENTRIES); assert!(cq_entries <= Self::MAX_CQ_ENTRIES); - let mut fallback_alloc = 0u64; let total_size = QueuePair::SQ_SIZE + QueuePair::CQ_SIZE + QueuePair::PER_QUEUE_PAGES * PAGE_SIZE; let dma_client = device.dma_client(); let mem = dma_client .allocate_dma_buffer(total_size) - .context("failed to allocate memory for the queues") - .or_else(|_| { - fallback_alloc += total_size as u64; - device.fallback_dma_client().map_or( - Err(AllocatorError::NoFallbackAllocator.into()), - |f| { - f.allocate_dma_buffer(total_size) - .context("fallback mem allocator also failed") - }, - ) - }); + .context("failed to allocate memory for the queues")?; QueuePair::new_or_restore( - spawner, - qid, - sq_entries, - cq_entries, - interrupt, - registers, - mem?, - None, - fallback_alloc, + spawner, qid, sq_entries, cq_entries, interrupt, registers, mem, None, ) } @@ -234,7 +206,6 @@ impl QueuePair { registers: Arc>, mem: MemoryBlock, saved_state: Option<&QueueHandlerSavedState>, - fallback_alloc: u64, ) -> anyhow::Result { // MemoryBlock is either allocated or restored prior calling here. let sq_mem_block = mem.subblock(0, QueuePair::SQ_SIZE); @@ -284,7 +255,6 @@ impl QueuePair { qid, sq_entries, cq_entries, - fallback_alloc, }) } @@ -305,11 +275,6 @@ impl QueuePair { self.task.await } - /// Indicates that fallback allocator was used for this queue pair. - pub fn fallback_alloc(&self) -> u64 { - self.fallback_alloc - } - /// Save queue pair state for servicing. pub async fn save(&self) -> anyhow::Result { // Return error if the queue does not have any memory allocated. @@ -356,7 +321,6 @@ impl QueuePair { registers, mem, Some(handler_data), - 0u64, ) } } diff --git a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs index 16b827c46f..afaef105d7 100644 --- a/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs +++ b/vm/devices/storage/disk_nvme/nvme_driver/src/tests.rs @@ -366,11 +366,6 @@ impl Dev self.device.dma_client() } - // TODO: We should add one for testing. - fn fallback_dma_client(&self) -> Option> { - None - } - fn max_interrupt_count(&self) -> u32 { self.device.max_interrupt_count() } diff --git a/vm/devices/user_driver/src/lib.rs b/vm/devices/user_driver/src/lib.rs index 1461c382e8..7724fdd2bb 100644 --- a/vm/devices/user_driver/src/lib.rs +++ b/vm/devices/user_driver/src/lib.rs @@ -33,9 +33,6 @@ pub trait DeviceBacking: 'static + Send + Inspect { /// DMA Client for the device. fn dma_client(&self) -> Arc; - /// Fallback DMA Client for the device. - fn fallback_dma_client(&self) -> Option>; - /// Returns the maximum number of interrupts that can be mapped. fn max_interrupt_count(&self) -> u32; @@ -72,4 +69,21 @@ pub trait DmaClient: Send + Sync + Inspect { /// Attach all previously allocated memory blocks. fn attach_pending_buffers(&self) -> anyhow::Result>; + + /// Query if this client supports persistent allocations. 
+ fn is_persistent(&self) -> bool; + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64; + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64; +} + +/// DMA allocator statistics per client. +pub struct DmaClientAllocStats { + /// How much memory (bytes) was allocated by a main allocator. + pub total_alloc: u64, + /// How much memory (bytes) was allocated by a fallback allocator. + pub fallback_alloc: u64, } diff --git a/vm/devices/user_driver/src/lockmem.rs b/vm/devices/user_driver/src/lockmem.rs index b7cb190630..154fb90fb6 100644 --- a/vm/devices/user_driver/src/lockmem.rs +++ b/vm/devices/user_driver/src/lockmem.rs @@ -134,4 +134,19 @@ impl crate::DmaClient for LockedMemorySpawner { fn attach_pending_buffers(&self) -> anyhow::Result> { anyhow::bail!("restore not supported for lockmem") } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + false + } + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64 { + 0 + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + 0 + } } diff --git a/vm/devices/user_driver/src/vfio.rs b/vm/devices/user_driver/src/vfio.rs index 137d1db334..4151c56079 100644 --- a/vm/devices/user_driver/src/vfio.rs +++ b/vm/devices/user_driver/src/vfio.rs @@ -57,7 +57,6 @@ pub struct VfioDevice { #[inspect(skip)] config_space: vfio_sys::RegionInfo, dma_client: Arc, - fallback_dma_client: Option>, } #[derive(Inspect)] @@ -75,16 +74,8 @@ impl VfioDevice { driver_source: &VmTaskDriverSource, pci_id: &str, dma_client: Arc, - fallback_dma_client: Option>, ) -> anyhow::Result { - Self::restore( - driver_source, - pci_id, - false, - dma_client, - fallback_dma_client, - ) - .await + Self::restore(driver_source, pci_id, false, dma_client).await } /// Creates a new VFIO-backed device for the PCI device with `pci_id`. @@ -94,7 +85,6 @@ impl VfioDevice { pci_id: &str, vf_keepalive: bool, dma_client: Arc, - fallback_dma_client: Option>, ) -> anyhow::Result { let path = Path::new("/sys/bus/pci/devices").join(pci_id); @@ -141,7 +131,6 @@ impl VfioDevice { driver_source: driver_source.clone(), interrupts: Vec::new(), dma_client, - fallback_dma_client, }; // Ensure bus master enable and memory space enable are set, and that @@ -246,10 +235,6 @@ impl DeviceBacking for VfioDevice { self.dma_client.clone() } - fn fallback_dma_client(&self) -> Option> { - self.fallback_dma_client.clone() - } - fn max_interrupt_count(&self) -> u32 { self.msix_info.count } diff --git a/vm/devices/user_driver_emulated_mock/src/lib.rs b/vm/devices/user_driver_emulated_mock/src/lib.rs index 0ed650d957..fb68810cd5 100644 --- a/vm/devices/user_driver_emulated_mock/src/lib.rs +++ b/vm/devices/user_driver_emulated_mock/src/lib.rs @@ -162,11 +162,6 @@ impl Option> { - None - } - fn max_interrupt_count(&self) -> u32 { self.controller.events.len() as u32 } diff --git a/vm/page_pool_alloc/src/lib.rs b/vm/page_pool_alloc/src/lib.rs index c55ba44655..352c87cee7 100644 --- a/vm/page_pool_alloc/src/lib.rs +++ b/vm/page_pool_alloc/src/lib.rs @@ -657,6 +657,8 @@ pub struct PagePoolAllocator { inner: Arc, #[inspect(skip)] device_id: usize, + /// Total alloc size in bytes for the duration. 
+ alloc_size: Mutex, } impl PagePoolAllocator { @@ -695,6 +697,7 @@ impl PagePoolAllocator { Ok(Self { inner: inner.clone(), device_id, + alloc_size: Mutex::new(0), }) } @@ -867,7 +870,7 @@ impl user_driver::DmaClient for PagePoolAllocator { let alloc = self .alloc(size_pages, "vfio dma".into()) - .context("failed to allocate shared mem")?; + .context("failed to allocate from page pool")?; // The VfioDmaBuffer trait requires that newly allocated buffers are // zeroed. @@ -883,6 +886,21 @@ impl user_driver::DmaClient for PagePoolAllocator { .map(|alloc| alloc.into_memory_block()) .collect() } + + /// Query if this client supports persistent allocations. + fn is_persistent(&self) -> bool { + true + } + + /// How much memory was allocated during session. + fn alloc_size(&self) -> u64 { + 0 + } + + /// How much backup memory was allocated during session (fallback). + fn fallback_alloc_size(&self) -> u64 { + 0 + } } #[cfg(test)] From 727fd7dabd6fb6e00968f0a8291ed5a7e3353298 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 25 Apr 2025 16:35:11 +0000 Subject: [PATCH 16/18] Tracking and inspecting improvements --- Cargo.lock | 1 + .../lower_vtl_permissions_guard/Cargo.toml | 1 + .../lower_vtl_permissions_guard/src/lib.rs | 8 +++++-- openhcl/openhcl_dma_manager/src/lib.rs | 5 ++-- vm/devices/user_driver/src/lockmem.rs | 24 +++++++++++++++---- vm/page_pool_alloc/src/lib.rs | 7 +++--- 6 files changed, 34 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab0f6572ad..93fd5aed9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3808,6 +3808,7 @@ dependencies = [ "anyhow", "hvdef", "inspect", + "parking_lot", "user_driver", "virt", ] diff --git a/openhcl/lower_vtl_permissions_guard/Cargo.toml b/openhcl/lower_vtl_permissions_guard/Cargo.toml index d4e2b17496..809158724d 100644 --- a/openhcl/lower_vtl_permissions_guard/Cargo.toml +++ b/openhcl/lower_vtl_permissions_guard/Cargo.toml @@ -9,6 +9,7 @@ rust-version.workspace = true [target.'cfg(target_os = "linux")'.dependencies] hvdef.workspace = true inspect.workspace = true +parking_lot.workspace = true user_driver.workspace = true virt.workspace = true diff --git a/openhcl/lower_vtl_permissions_guard/src/lib.rs b/openhcl/lower_vtl_permissions_guard/src/lib.rs index 1b2c08f048..616267cbba 100644 --- a/openhcl/lower_vtl_permissions_guard/src/lib.rs +++ b/openhcl/lower_vtl_permissions_guard/src/lib.rs @@ -13,6 +13,7 @@ pub use device_dma::LowerVtlDmaBuffer; use anyhow::Context; use anyhow::Result; use inspect::Inspect; +use parking_lot::Mutex; use std::sync::Arc; use user_driver::DmaClient; use user_driver::memory::MemoryBlock; @@ -79,6 +80,7 @@ pub struct LowerVtlMemorySpawner { spawner: T, #[inspect(skip)] vtl_protect: Arc, + alloc_size: Mutex, } impl LowerVtlMemorySpawner { @@ -88,6 +90,7 @@ impl LowerVtlMemorySpawner { Self { spawner, vtl_protect, + alloc_size: Mutex::new(0), } } } @@ -98,6 +101,7 @@ impl DmaClient for LowerVtlMemorySpawner { let vtl_guard = PagesAccessibleToLowerVtl::new_from_pages(self.vtl_protect.clone(), mem.pfns()) .context("failed to lower VTL permissions on memory block")?; + *self.alloc_size.lock() += len as u64; Ok(MemoryBlock::new(LowerVtlDmaBuffer { block: mem, @@ -116,10 +120,10 @@ impl DmaClient for LowerVtlMemorySpawner { /// How much memory was allocated during session. fn alloc_size(&self) -> u64 { - 0 + *self.alloc_size.lock() } - /// How much backup memory was allocated during session (fallback). + /// Not supported for this allocator. 
fn fallback_alloc_size(&self) -> u64 { 0 } diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 819817edc6..3b371cde45 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -298,13 +298,13 @@ impl DmaManagerInner { LowerVtlPermissionPolicy::Any => { // No persistence needed means the `LockedMemorySpawner` // using normal VTL2 ram is fine. - DmaClientBacking::LockedMemory(LockedMemorySpawner) + DmaClientBacking::LockedMemory(LockedMemorySpawner::new()) } LowerVtlPermissionPolicy::Vtl0 => { // `LockedMemorySpawner` uses private VTL2 ram, so // lowering VTL permissions is required. DmaClientBacking::LockedMemoryLowerVtl(LowerVtlMemorySpawner::new( - LockedMemorySpawner, + LockedMemorySpawner::new(), self.lower_vtl.clone(), )) } @@ -499,6 +499,7 @@ impl DmaClient for OpenhclDmaClient { &self, total_size: usize, ) -> anyhow::Result { + // The stats must be tracked here, not in the backing. let mut stats = self.inner_stats.lock(); stats.total_alloc += total_size as u64; let mem_block = self.backing.allocate_dma_buffer(total_size).or_else(|_| { diff --git a/vm/devices/user_driver/src/lockmem.rs b/vm/devices/user_driver/src/lockmem.rs index 154fb90fb6..aa9fee6bf6 100644 --- a/vm/devices/user_driver/src/lockmem.rs +++ b/vm/devices/user_driver/src/lockmem.rs @@ -6,6 +6,7 @@ use crate::memory::MappedDmaTarget; use anyhow::Context; use inspect::Inspect; +use parking_lot::Mutex; use std::ffi::c_void; use std::fs::File; use std::io::Read; @@ -123,12 +124,25 @@ unsafe impl MappedDmaTarget for LockedMemory { } } -#[derive(Clone, Inspect)] -pub struct LockedMemorySpawner; +#[derive(Inspect)] +pub struct LockedMemorySpawner { + alloc_size: Mutex, +} + +impl LockedMemorySpawner { + /// Create a new [`LockedMemorySpawner`]. + pub fn new() -> Self { + Self { + alloc_size: Mutex::new(0), + } + } +} impl crate::DmaClient for LockedMemorySpawner { fn allocate_dma_buffer(&self, len: usize) -> anyhow::Result { - Ok(crate::memory::MemoryBlock::new(LockedMemory::new(len)?)) + let mem_block = crate::memory::MemoryBlock::new(LockedMemory::new(len)?); + *self.alloc_size.lock() += len as u64; + Ok(mem_block) } fn attach_pending_buffers(&self) -> anyhow::Result> { @@ -142,10 +156,10 @@ impl crate::DmaClient for LockedMemorySpawner { /// How much memory was allocated during session. fn alloc_size(&self) -> u64 { - 0 + *self.alloc_size.lock() } - /// How much backup memory was allocated during session (fallback). + /// Not supported for this allocator. fn fallback_alloc_size(&self) -> u64 { 0 } diff --git a/vm/page_pool_alloc/src/lib.rs b/vm/page_pool_alloc/src/lib.rs index 352c87cee7..8abaca0562 100644 --- a/vm/page_pool_alloc/src/lib.rs +++ b/vm/page_pool_alloc/src/lib.rs @@ -657,7 +657,7 @@ pub struct PagePoolAllocator { inner: Arc, #[inspect(skip)] device_id: usize, - /// Total alloc size in bytes for the duration. + /// Total alloc size in bytes for the session duration. alloc_size: Mutex, } @@ -871,6 +871,7 @@ impl user_driver::DmaClient for PagePoolAllocator { let alloc = self .alloc(size_pages, "vfio dma".into()) .context("failed to allocate from page pool")?; + *self.alloc_size.lock() += len as u64; // The VfioDmaBuffer trait requires that newly allocated buffers are // zeroed. @@ -894,10 +895,10 @@ impl user_driver::DmaClient for PagePoolAllocator { /// How much memory was allocated during session. 
fn alloc_size(&self) -> u64 { - 0 + *self.alloc_size.lock() } - /// How much backup memory was allocated during session (fallback). + /// Not supported for this allocator. fn fallback_alloc_size(&self) -> u64 { 0 } From 09efefbafe7a31188865319ec3473debac88cad1 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Fri, 25 Apr 2025 20:08:40 +0000 Subject: [PATCH 17/18] Inform openhcl about self hint and print it --- openhcl/bootloader_fdt_parser/src/lib.rs | 14 ++++++++++++++ openhcl/openhcl_boot/src/dt.rs | 5 +++++ openhcl/openhcl_boot/src/host_params/dt.rs | 5 +++++ openhcl/openhcl_boot/src/host_params/mod.rs | 3 +++ openhcl/openhcl_boot/src/main.rs | 1 + openhcl/openhcl_dma_manager/src/lib.rs | 14 ++++++++++++++ openhcl/underhill_core/src/worker.rs | 8 ++++++++ vm/page_pool_alloc/src/lib.rs | 7 +++++++ 8 files changed, 57 insertions(+) diff --git a/openhcl/bootloader_fdt_parser/src/lib.rs b/openhcl/bootloader_fdt_parser/src/lib.rs index 2b056ca72f..b6c04f6794 100644 --- a/openhcl/bootloader_fdt_parser/src/lib.rs +++ b/openhcl/bootloader_fdt_parser/src/lib.rs @@ -169,6 +169,8 @@ pub struct ParsedBootDtInfo { /// VTL2 range for private pool memory. #[inspect(iter_by_index)] pub private_pool_ranges: Vec, + /// Source of DMA hint calculation. + pub dma_hint_self: bool, } fn err_to_owned(e: fdt::parser::Error<'_>) -> anyhow::Error { @@ -207,6 +209,7 @@ struct OpenhclInfo { memory_allocation_mode: MemoryAllocationMode, isolation: IsolationType, private_pool_ranges: Vec, + dma_hint_self: bool, } fn parse_memory_openhcl(node: &Node<'_>) -> anyhow::Result { @@ -394,6 +397,11 @@ fn parse_openhcl(node: &Node<'_>) -> anyhow::Result { .transpose() .context("unable to read vtl0-alias-map")?; + let dma_hint_self = matches!( + try_find_property(node, "dma-hint").and_then(|p| p.read_str().ok()), + Some("self") + ); + // Extract vmbus mmio information from the overall memory map. 
let vtl0_mmio = memory .iter() @@ -416,6 +424,7 @@ fn parse_openhcl(node: &Node<'_>) -> anyhow::Result { memory_allocation_mode, isolation, private_pool_ranges, + dma_hint_self, }) } @@ -509,6 +518,7 @@ impl ParsedBootDtInfo { let mut isolation = IsolationType::None; let mut vtl2_reserved_range = MemoryRange::EMPTY; let mut private_pool_ranges = Vec::new(); + let mut dma_hint_self = false; let parser = Parser::new(raw) .map_err(err_to_owned) @@ -538,6 +548,7 @@ impl ParsedBootDtInfo { memory_allocation_mode: n_memory_allocation_mode, isolation: n_isolation, private_pool_ranges: n_private_pool_ranges, + dma_hint_self: n_dma_hint_self, } = parse_openhcl(&child)?; vtl0_mmio = n_vtl0_mmio; config_ranges = n_config_ranges; @@ -548,6 +559,7 @@ impl ParsedBootDtInfo { isolation = n_isolation; vtl2_reserved_range = n_vtl2_reserved_range; private_pool_ranges = n_private_pool_ranges; + dma_hint_self = n_dma_hint_self; } _ if child.name.starts_with("memory@") => { @@ -580,6 +592,7 @@ impl ParsedBootDtInfo { isolation, vtl2_reserved_range, private_pool_ranges, + dma_hint_self, }) } } @@ -945,6 +958,7 @@ mod tests { range: MemoryRange::new(0x60000..0x70000), vnode: 0, }], + dma_hint_self: false, }; let dt = build_dt(&orig_info).unwrap(); diff --git a/openhcl/openhcl_boot/src/dt.rs b/openhcl/openhcl_boot/src/dt.rs index 3a85058c9c..40b16536df 100644 --- a/openhcl/openhcl_boot/src/dt.rs +++ b/openhcl/openhcl_boot/src/dt.rs @@ -483,6 +483,11 @@ pub fn write_dt( openhcl_builder = openhcl_builder.add_u64(p_vtl0_alias_map, data)?; } + if partition_info.dma_hint_self { + let p_dma_hint = openhcl_builder.add_string("dma-hint")?; + openhcl_builder = openhcl_builder.add_str(p_dma_hint, "self")?; + } + #[derive(Debug, Copy, Clone, PartialEq, Eq)] struct Vtl2MemoryEntry { range: MemoryRange, diff --git a/openhcl/openhcl_boot/src/host_params/dt.rs b/openhcl/openhcl_boot/src/host_params/dt.rs index fbadf47feb..de72ea9676 100644 --- a/openhcl/openhcl_boot/src/host_params/dt.rs +++ b/openhcl/openhcl_boot/src/host_params/dt.rs @@ -456,6 +456,7 @@ impl PartitionInfo { .vtl2_used_ranges .extend(flatten_ranges(used_ranges.iter().copied())); + let mut vtl2_dma_hint_self = false; // Decide if we will reserve memory for a VTL2 private pool. Parse this // from the final command line, or the host provided device tree value. let vtl2_gpa_pool_size = { @@ -472,6 +473,7 @@ impl PartitionInfo { { // If host did not provide the DMA hint value, re-evaluate // it internally if conditions satisfy. 
+ vtl2_dma_hint_self = true; vtl2_calculate_dma_hint(parsed.cpu_count(), storage) } else { hostval @@ -512,6 +514,7 @@ impl PartitionInfo { .extend(flatten_ranges(used_ranges.iter().copied())); storage.vtl2_pool_memory = pool; + storage.dma_hint_self = vtl2_dma_hint_self; } // If we can trust the host, use the provided alias map @@ -540,6 +543,7 @@ impl PartitionInfo { entropy, vtl0_alias_map: _, nvme_keepalive, + dma_hint_self, } = storage; assert!(!vtl2_used_ranges.is_empty()); @@ -562,6 +566,7 @@ impl PartitionInfo { *gic = parsed.gic.clone(); *entropy = parsed.entropy.clone(); *nvme_keepalive = parsed.nvme_keepalive; + *dma_hint_self = vtl2_dma_hint_self; Ok(Some(storage)) } diff --git a/openhcl/openhcl_boot/src/host_params/mod.rs b/openhcl/openhcl_boot/src/host_params/mod.rs index 05b087f011..b53652715a 100644 --- a/openhcl/openhcl_boot/src/host_params/mod.rs +++ b/openhcl/openhcl_boot/src/host_params/mod.rs @@ -95,6 +95,8 @@ pub struct PartitionInfo { pub vtl0_alias_map: Option, /// Host is compatible with DMA preservation / NVMe keep-alive. pub nvme_keepalive: bool, + /// DMA hint was calculated in boot-shim instead of host. + pub dma_hint_self: bool, } impl PartitionInfo { @@ -126,6 +128,7 @@ impl PartitionInfo { entropy: None, vtl0_alias_map: None, nvme_keepalive: false, + dma_hint_self: false, } } diff --git a/openhcl/openhcl_boot/src/main.rs b/openhcl/openhcl_boot/src/main.rs index d2cfce8e74..648f7f5be0 100644 --- a/openhcl/openhcl_boot/src/main.rs +++ b/openhcl/openhcl_boot/src/main.rs @@ -956,6 +956,7 @@ mod test { entropy: None, vtl0_alias_map: None, nvme_keepalive: false, + dma_hint_self: false, } } diff --git a/openhcl/openhcl_dma_manager/src/lib.rs b/openhcl/openhcl_dma_manager/src/lib.rs index 3b371cde45..76ea8c2fff 100644 --- a/openhcl/openhcl_dma_manager/src/lib.rs +++ b/openhcl/openhcl_dma_manager/src/lib.rs @@ -407,6 +407,20 @@ impl OpenhclDmaManager { Ok(()) } + + /// Return shared pool size in bytes. + pub fn shared_pool_size(&self) -> u64 { + self.shared_pool + .as_ref() + .map_or(0, |pool| pool.total_size()) + } + + /// Return private pool size in bytes. + pub fn private_pool_size(&self) -> u64 { + self.private_pool + .as_ref() + .map_or(0, |pool| pool.total_size()) + } } /// A spawner for creating DMA clients. diff --git a/openhcl/underhill_core/src/worker.rs b/openhcl/underhill_core/src/worker.rs index 2fb8e487af..2c1051941a 100644 --- a/openhcl/underhill_core/src/worker.rs +++ b/openhcl/underhill_core/src/worker.rs @@ -1518,6 +1518,14 @@ async fn new_underhill_vm( .context("failed to restore global dma manager")?; } + // Print important info about DMA sizes. + tracing::info!( + dma_hint_self = boot_info.dma_hint_self, + shared_pool_size = shared_pool_size, + private_pool_size = dma_manager.private_pool_size(), + "dma pool" + ); + // Test with the highest VTL for which we have a GuestMemory object let highest_vtl_gm = gm.vtl1().unwrap_or(gm.vtl0()); diff --git a/vm/page_pool_alloc/src/lib.rs b/vm/page_pool_alloc/src/lib.rs index 8abaca0562..69234c2484 100644 --- a/vm/page_pool_alloc/src/lib.rs +++ b/vm/page_pool_alloc/src/lib.rs @@ -504,6 +504,7 @@ pub struct PagePool { inner: Arc, #[inspect(iter_by_index)] ranges: Vec, + total_len: u64, } impl PagePool { @@ -557,6 +558,7 @@ impl PagePool { mapping, }), ranges: memory.to_vec(), + total_len: total_len as u64, }) } @@ -621,6 +623,11 @@ impl PagePool { Ok(()) } } + + /// Returns the total size of the pool in bytes. 
+ pub fn total_size(&self) -> u64 { + self.total_len + } } /// A spawner for [`PagePoolAllocator`] instances. From 1c34003b3a3c0f81cf6cb7c94a0d894f4b0e4766 Mon Sep 17 00:00:00 2001 From: Yuri Pavlenko Date: Mon, 28 Apr 2025 13:52:56 -0700 Subject: [PATCH 18/18] Swapped one row --- openhcl/openhcl_boot/src/host_params/dma_hint.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openhcl/openhcl_boot/src/host_params/dma_hint.rs b/openhcl/openhcl_boot/src/host_params/dma_hint.rs index e0925a74e9..6208a95602 100644 --- a/openhcl/openhcl_boot/src/host_params/dma_hint.rs +++ b/openhcl/openhcl_boot/src/host_params/dma_hint.rs @@ -27,7 +27,7 @@ const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ (8, 170, 20), (8, 176, 20), (16, 234, 12), - (16, 256, 18), // There is another configuration with '20'. + (16, 256, 20), // There is another configuration with '18'. (16, 268, 38), (16, 282, 54), (24, 420, 66), @@ -50,7 +50,7 @@ const LOOKUP_TABLE: &[(u16, u16, u16)] = &[ (112, 1566, 288), (128, 1342, 84), (128, 1360, 84), - // (896, 12912, 516), // Needs to be validated as the vNIC number is unknown. + (896, 12912, 0), // (516) Needs to be validated as the vNIC number is unknown. ]; /// Round up to next 2MiB.
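    // [Illustrative sketch, not part of the patch] The tuple columns are not
    // labeled in this excerpt; assuming they are (VP count, VTL2 memory, DMA
    // hint) in a consistent unit, a lookup over this table could look like:
    //
    //     fn lookup_hint(vp_count: u16, mem: u16) -> Option<u16> {
    //         LOOKUP_TABLE
    //             .iter()
    //             .filter(|(vps, _, _)| *vps == vp_count)
    //             .min_by_key(|(_, m, _)| m.abs_diff(mem))
    //             .map(|&(_, _, hint)| hint)
    //     }
    //
    // i.e. take the rows for this VP count, pick the one whose memory column
    // is closest to the partition's assigned memory, and use its hint column.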