feat(runtimes/nvidia): implement device memory allocation during graph construction

Signed-off-by: YdrMaster <[email protected]>
YdrMaster committed Jan 3, 2024
1 parent 54594c9 commit 8997de6
Showing 4 changed files with 243 additions and 50 deletions.
14 changes: 14 additions & 0 deletions runtimes/nvidia/src/driver/memory.rs
@@ -19,6 +19,20 @@ impl ContextGuard<'_> {
    }
}

impl Stream<'_> {
    #[inline]
    pub fn malloc(&self, size: usize) -> DevicePtr {
        let mut ptr: cuda::CUdeviceptr = 0;
        cuda::invoke!(cuMemAllocAsync(&mut ptr, size, self.as_raw()));
        DevicePtr(ptr)
    }

    #[inline]
    pub fn free(&self, ptr: DevicePtr) {
        cuda::invoke!(cuMemFreeAsync(ptr.0, self.as_raw()));
    }
}

impl Drop for DevicePtr {
    #[inline]
    fn drop(&mut self) {
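The new Stream::malloc/Stream::free pair wraps cuMemAllocAsync/cuMemFreeAsync, so allocation and release are ordered by the stream rather than synchronizing the whole device. A minimal usage sketch; the surrounding Context::apply and ContextGuard::stream calls come from elsewhere in this crate, and the `ctx` variable is hypothetical:

    // Sketch only: stream-ordered allocate, use, free.
    ctx.apply(|guard| {
        let stream = guard.stream();
        let buf = stream.malloc(4 << 20); // enqueues cuMemAllocAsync on this stream
        // ... launch kernels that read/write `buf` on `stream` ...
        stream.free(buf); // enqueues cuMemFreeAsync, ordered after the kernels
    });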
224 changes: 174 additions & 50 deletions runtimes/nvidia/src/graph.rs
@@ -1,16 +1,18 @@
use crate::driver::{self, ContextGuard};
use computation::Tensor;
use graph_topo::GraphTopo;
use crate::{
    driver::{self, ContextGuard},
    kernel::{GraphBuilder, GraphUser, Resources},
};
use stack_calculator::{flat, unidir, RealtimeCalculator};
use std::{alloc::Layout, collections::BTreeSet, sync::Arc};

pub struct Graph {
    ctx: Arc<driver::Context>,
    graph: driver::ExecutableGraph,
    topology: GraphTopo,
    edges: Vec<MemOffset>,
    executable: driver::ExecutableGraph,
    #[allow(unused)] // kept so the kernel resources live as long as the graph
    resources: Resources,
    static_mem: driver::DevicePtr,
    stack: driver::DevicePtr,
    /// Topology plus per-node workspace offsets and per-edge memory offsets.
    offsets: graph_topo::Graph<usize, MemOffset>,
}

impl Drop for Graph {
@@ -34,23 +36,23 @@ impl Graph {
    pub fn run(&self) {
        self.ctx.apply(|ctx| {
            let stream = ctx.stream();
            unsafe { self.graph.launch_on(&stream) }
            unsafe { self.executable.launch_on(&stream) }
        })
    }

    #[inline]
    pub fn copy_in_one<T>(&mut self, i: usize, data: &[T]) {
        let i = self.topology.global_inputs().nth(i).unwrap();
        let offset = self.edges[i].offset();
        let i = self.offsets.topology.global_inputs().nth(i).unwrap();
        let offset = self.offsets.edges[i].offset();
        self.ctx.apply(|ctx| unsafe {
            self.static_mem.copy_in(offset, data, ctx);
        });
    }

    #[inline]
    pub fn copy_out_one<T>(&mut self, i: usize, data: &mut [T]) {
        let i = self.topology.global_outputs()[i];
        let offset = self.edges[i as usize].offset();
        let i = self.offsets.topology.global_outputs()[i];
        let offset = self.offsets.edges[i as usize].offset();
        self.ctx.apply(|ctx| unsafe {
            self.static_mem.copy_out(offset, data, ctx);
        });
@@ -61,11 +63,11 @@ impl Graph {
    where
        I: IntoIterator<Item = (&'a usize, &'a [T])>,
    {
        let start = self.topology.global_inputs().start;
        let start = self.offsets.topology.global_inputs().start;
        self.ctx.apply(|ctx| {
            let stream = ctx.stream();
            for (i, data) in data {
                let offset = self.edges[start + i].offset();
                let offset = self.offsets.edges[start + i].offset();
                unsafe { self.static_mem.copy_in_async(offset, data, &stream) };
            }
        });
@@ -76,102 +78,224 @@
    where
        I: IntoIterator<Item = (&'a usize, &'a mut [T])>,
    {
        let global_output = self.topology.global_outputs();
        let global_output = self.offsets.topology.global_outputs();
        self.ctx.apply(|ctx| {
            let stream = ctx.stream();
            for (i, data) in data {
                let offset = self.edges[global_output[*i] as usize].offset();
                let offset = self.offsets.edges[global_output[*i] as usize].offset();
                unsafe { self.static_mem.copy_out_async(offset, data, &stream) };
            }
        });
    }
}
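Host-side use of a finished Graph is copy in, launch, copy out. A minimal sketch, assuming a single f32 input and output both at index 0 (the indices and element type are hypothetical):

    // Sketch only: one pass through a built Graph.
    fn run_once(graph: &mut Graph, input: &[f32], output: &mut [f32]) {
        graph.copy_in_one(0, input); // host-to-device into the static region
        graph.run(); // launch the instantiated CUDA graph
        graph.copy_out_one(0, output); // device-to-host from the static region
    }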

#[allow(non_camel_case_types)]
type urc = u16;
/// Sentinel reference count: the edge lives in the static region and is never freed.
const STATIC: urc = urc::MAX;
/// Alignment granularity for every device-memory suballocation.
const CUDA_ALIGN: usize = 256;

impl ContextGuard<'_> {
    pub fn runtime_graph(&self, src: &computation::Graph) -> Graph {
        let src = &src.0;

        let mut static_mem = flat::RealtimeCalculator::default();
        let mut static_mem: flat::RealtimeCalculator = flat::RealtimeCalculator::default();
        let mut stack = unidir::RealtimeCalculator::default();

        let mut edges = vec![MemOffset::INVALID; src.edges.len()];
        let mut nodes = vec![usize::MAX; src.0.nodes.len()];
        let mut edges = vec![MemOffset::INVALID; src.0.edges.len()];
        let mut local_edges = BTreeSet::<usize>::new();

        #[allow(non_camel_case_types)]
        type urc = u16;
        const STATIC: urc = urc::MAX;
        let mut edge_rc = vec![0 as urc; src.edges.len()];
        for edge_idx in src.topology.connections() {
        // Count how many nodes consume each edge
        let mut edge_rc = vec![0 as urc; src.0.edges.len()];
        for edge_idx in src.0.topology.connections() {
            edge_rc[edge_idx] += 1;
        }

        src.topology
        // Allocate static storage for the whole graph's inputs and outputs
        src.0
            .topology
            .global_inputs()
            .chain(src.topology.global_outputs())
            .chain(src.0.topology.global_outputs())
            .for_each(|edge_idx| {
                edge_rc[edge_idx] = STATIC;
                edges[edge_idx] = MemOffset::from_static(
                    // whole-graph inputs/outputs live in the static region
                    static_mem.alloc(cuda_layout(&src.edges[edge_idx])).start,
                );
                alloc_static(src, edge_idx, &mut edges, &mut edge_rc, &mut static_mem)
            });

        let mut graph = driver::Graph::new();
        // Compute each node's workspace requirement and plan the stack region
        let mut builders = Vec::<Box<dyn GraphBuilder>>::with_capacity(src.0.nodes.len());
        let mut resources = Resources::default();
        for (node_idx, inputs, outputs) in &src.0.topology {
            let (op, _) = &src.0.nodes[node_idx];
            let builder = op.builder(&mut resources, self);
            let workspace = builder.workspace().align_to(CUDA_ALIGN).unwrap();
            builders.push(builder);

        for (node_idx, inputs, outputs) in &src.topology {
            let (op, _) = &src.nodes[node_idx];
            // TODO: allocate stack space and build the compute nodes
            // alloc for outputs
            for edge_idx in outputs.clone() {
                if edge_rc[edge_idx] != STATIC {
                    alloc_stack(src, edge_idx, &mut edges, &mut stack);
                }
            }
            // alloc for workspaces
            alloc_workspace(workspace, node_idx, &mut nodes, &mut stack);
            // free for temp outputs
            for edge_idx in outputs {
                if edge_rc[edge_idx] == 0 {
                    free_stack(src, edge_idx, &edges[edge_idx], &mut stack);
                }
            }
            // free for inputs or alloc for local static inputs
            for edge_idx in inputs {
                let offset = edges[edge_idx];
                if offset == MemOffset::INVALID {
                    local_edges.insert(edge_idx);
                    alloc_static(src, edge_idx, &mut edges, &mut edge_rc, &mut static_mem);
                } else {
                    let rc = &mut edge_rc[edge_idx];
                    debug_assert_ne!(*rc, 0);
                    *rc -= 1;
                    if *rc == 0 {
                        free_stack(src, edge_idx, &offset, &mut stack);
                    }
                }
            }
        }
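        // Net effect of the pass above: each node reserves its outputs and
        // its workspace (the workspace is released immediately, so only its
        // peak counts), edges no visited node produced (weights/constants)
        // are promoted to the static region, and a stack edge is released
        // once its last consumer has been scanned.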

        let static_mem = {
        // Now actually allocate the device memory
        let resources = resources;
        let edges = edges;
        let (static_mem, stack) = {
            let stream = self.stream();
            let mut static_mem = self.malloc(static_mem.peak());

            let mut static_mem = stream.malloc(static_mem.peak());
            let stack = stream.malloc(stack.peak());

            for edge_idx in local_edges {
                let offset = edges[edge_idx].offset();
                let tensor = &src.edges[edge_idx].0;
                let tensor = &src.0.edges[edge_idx].0;
                let ptr = tensor.blob.as_ref().unwrap().get().cast::<u8>();
                let len = tensor.blob_mem_layout().size();
                unsafe {
                    let data = std::slice::from_raw_parts(ptr, len);
                    static_mem.copy_in_async(offset, data, &stream);
                }
            }
            static_mem

            (static_mem, stack)
        };

        let mut graph = driver::Graph::new();
        for (node_idx, inputs, outputs) in &src.0.topology {
            // TODO: resolve each offset to an actual device address
            let mut temp = Vec::with_capacity(1 + inputs.len() + outputs.len());
            temp.extend(inputs.iter().map(|i| edges[*i as usize]).map(|offset| {
                if offset.is_static() {
                    todo!()
                } else {
                    todo!()
                }
            }));
            // temp layout: [workspace, inputs.., outputs..]
            builders[node_idx].push_to(
                &mut graph,
                &resources,
                &temp[0],
                &temp[1..][..inputs.len()],
                &temp[1 + inputs.len()..],
            )
        }

        Graph {
            ctx: self.clone_ctx(),
            graph: graph.instantiate(self),
            topology: src.topology.clone(),
            edges,
            executable: graph.instantiate(self),
            resources,
            static_mem,
            stack: self.malloc(stack.peak()),
            stack,
            offsets: graph_topo::Graph {
                topology: src.0.topology.clone(),
                nodes,
                edges,
            },
        }
    }
}

#[inline(always)]
fn cuda_layout(edge: &(Tensor, String)) -> Layout {
    edge.0.blob_mem_layout().align_to(256).unwrap()
fn alloc_workspace(
    workspace: Layout,
    node_idx: usize,
    nodes: &mut [usize],
    stack: &mut unidir::RealtimeCalculator,
) {
    // Reserve, record, release: the node's offset and the stack's peak are
    // kept, but the space itself can be reused by the next node.
    let workspace = stack.alloc(workspace);
    nodes[node_idx] = workspace.start;
    stack.free(workspace);
}

fn alloc_stack(
    src: &computation::Graph,
    edge_idx: usize,
    edges: &mut [MemOffset],
    calculator: &mut unidir::RealtimeCalculator,
) {
    let layout = src.0.edges[edge_idx]
        .0
        .blob_mem_layout()
        .align_to(CUDA_ALIGN)
        .unwrap();
    let offset = calculator.alloc(layout).start;
    edges[edge_idx] = MemOffset::from_stack(offset);
}

fn free_stack(
    src: &computation::Graph,
    edge_idx: usize,
    offset: &MemOffset,
    calculator: &mut unidir::RealtimeCalculator,
) {
    let start = offset.offset();
    let len = src.0.edges[edge_idx].0.blob_mem_layout().size();
    calculator.free(start..start + len);
}

fn alloc_static(
    src: &computation::Graph,
    edge_idx: usize,
    edges: &mut [MemOffset],
    edge_rc: &mut [urc],
    calculator: &mut flat::RealtimeCalculator,
) {
    let layout = src.0.edges[edge_idx]
        .0
        .blob_mem_layout()
        .align_to(CUDA_ALIGN)
        .unwrap();
    let offset = calculator.alloc(layout).start;
    edges[edge_idx] = MemOffset::from_static(offset);
    edge_rc[edge_idx] = STATIC;
}
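The two calculators only simulate placement: flat::RealtimeCalculator hands out ever-growing offsets for the static region, while unidir::RealtimeCalculator reuses freed ranges for the stack region, and peak() reports the high-water mark that is malloc'ed once at the end. A toy sketch using only the calls seen above (sizes are illustrative, and the exact stack_calculator semantics are assumed):

    // Sketch only: plan three stack tensors, then reserve the peak once.
    fn plan_stack() -> usize {
        let mut stack = unidir::RealtimeCalculator::default();
        let a = stack.alloc(Layout::from_size_align(1024, CUDA_ALIGN).unwrap());
        let b = stack.alloc(Layout::from_size_align(512, CUDA_ALIGN).unwrap());
        stack.free(a); // the hole below `b` may be reused
        let c = stack.alloc(Layout::from_size_align(2048, CUDA_ALIGN).unwrap());
        stack.free(b);
        stack.free(c);
        stack.peak() // one device allocation of this size covers every placement
    }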

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(transparent)]
struct MemOffset(usize);

impl MemOffset {
    const INVALID: MemOffset = MemOffset(usize::MAX);
    const INVALID: Self = Self(usize::MAX);
    const BIT: usize = 1 << (usize::BITS - 1);

    fn from_static(offset: usize) -> Self {
    #[inline]
    const fn from_static(offset: usize) -> Self {
        Self(offset)
    }

    #[inline]
    const fn from_stack(offset: usize) -> Self {
        Self(offset | Self::BIT)
    }

    fn is_static(self) -> bool {
        self.0 & Self::BIT != 0
    #[inline]
    const fn is_static(self) -> bool {
        self.0 & Self::BIT == 0
    }

    #[inline]
    fn offset(self) -> usize {
        debug_assert_ne!(self, Self::INVALID);
        self.0 & !Self::BIT
    }
}
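MemOffset packs the region tag into the top bit of a usize: from_static leaves the bit clear, from_stack sets it, is_static tests it, and offset() strips it before use. A quick illustration of the intended invariant (the values are arbitrary):

    // Sketch only: tag-bit round trips.
    let s = MemOffset::from_static(0x100);
    assert!(s.is_static() && s.offset() == 0x100);
    let t = MemOffset::from_stack(0x100);
    assert!(!t.is_static() && t.offset() == 0x100);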