Skip to content

Commit

Permalink
feat(runtimes/nvidia): 实现构图的外围逻辑
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <[email protected]>
  • Loading branch information
YdrMaster committed Jan 2, 2024
1 parent a8eb0f0 commit f88d16c
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 22 deletions.
2 changes: 1 addition & 1 deletion runtimes/nvidia/src/driver/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::{sync::OnceLock, vec::Vec};
pub(crate) fn devices() -> &'static Vec<Device> {
static MANAGER: OnceLock<Vec<Device>> = OnceLock::new();
MANAGER.get_or_init(|| {
cuda::invoke!(cuInit(0));
cuda::init();

let mut device_count = 0i32;
cuda::invoke!(cuDeviceGetCount(&mut device_count));
Expand Down
2 changes: 1 addition & 1 deletion runtimes/nvidia/src/driver/graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ fn test_memcpy3d() {
/// 测试 cuda graph 与 context 的交互行为。
#[test]
fn test_graph_exec() {
cuda::invoke!(cuInit(0));
cuda::init();

// 创建 cuda graph 不需要 context。
let mut graph: cuda::CUgraph = null_mut();
Expand Down
11 changes: 6 additions & 5 deletions runtimes/nvidia/src/driver/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
}};
}

/// Initializes the CUDA driver API.
///
/// Wraps `cuInit(0)` — the flag argument is required to be 0 by the CUDA
/// driver API. Must run before any other driver call; callers in this crate
/// (e.g. `devices()` and the graph tests) invoke it first for that reason.
/// NOTE(review): `invoke!` presumably checks the returned `CUresult` and
/// panics on failure — confirm against the macro definition above.
#[inline(always)]
pub(crate) fn init() {
    invoke!(cuInit(0));
}

pub(super) use invoke;
}

Expand All @@ -21,11 +26,6 @@ mod graph;
mod memory;
mod stream;

#[inline(always)]
pub(crate) fn init() {
bindings::invoke!(cuInit(0));
}

trait AsRaw<T> {
unsafe fn as_raw(&self) -> T;
}
Expand All @@ -34,6 +34,7 @@ trait WithCtx {
unsafe fn ctx(&self) -> bindings::CUcontext;
}

pub(crate) use bindings::init;
pub(crate) use context::{Context, ContextGuard};
pub(crate) use device::devices;
pub(crate) use graph::{ExecutableGraph, Graph};
Expand Down
81 changes: 66 additions & 15 deletions runtimes/nvidia/src/graph.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use crate::driver::{self, ContextGuard};
use computation::Tensor;
use graph_topo::GraphTopo;
use stack_calculator::{flat, unidir, RealtimeCalculator};
use std::sync::Arc;
use std::{alloc::Layout, collections::BTreeSet, sync::Arc};

pub struct Graph {
ctx: Arc<driver::Context>,
Expand All @@ -12,14 +13,6 @@ pub struct Graph {
stack: driver::Blob,
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(transparent)]
struct MemOffset(usize);

impl MemOffset {
const INVALID: MemOffset = MemOffset(usize::MAX);
}

impl Graph {
#[inline]
pub fn new(src: &computation::Graph, dev: usize) -> Self {
Expand All @@ -41,23 +34,81 @@ impl ContextGuard<'_> {
pub fn runtime_graph(&self, src: &computation::Graph) -> Graph {
let src = &src.0;

let mut flat = flat::RealtimeCalculator::default();
let mut unidir = unidir::RealtimeCalculator::default();
let mut static_mem = flat::RealtimeCalculator::default();
let mut stack = unidir::RealtimeCalculator::default();

let mut edges = vec![MemOffset::INVALID; src.edges.len()];
let mut local_edges = BTreeSet::<usize>::new();

driver::init();
let graph = driver::Graph::new();
#[allow(non_camel_case_types)]
type urc = u16;
const STATIC: urc = urc::MAX;
let mut edge_rc = vec![0 as urc; src.edges.len()];
for edge_idx in src.topology.connections() {
edge_rc[edge_idx] += 1;
}

src.topology
.global_inputs()
.chain(src.topology.global_outputs())
.for_each(|edge_idx| {
edge_rc[edge_idx] = STATIC;
edges[edge_idx] = MemOffset::from_static(
// 全图输入输出分配在静态存储区
static_mem.alloc(cuda_layout(&src.edges[edge_idx])).start,
);
});

let mut static_mem = self.malloc(flat.peak());
let mut graph = driver::Graph::new();

for (node_idx, inputs, outputs) in &src.topology {
let (op, _) = &src.nodes[node_idx];
// TODO 分配栈空间,构造计算节点
}

let static_mem = {
// TODO 把分配和拷贝调度到流上异步执行
let stream = self.stream();
let mut static_mem = self.malloc(static_mem.peak());
for edge_idx in local_edges {
let blob = src.edges[edge_idx].0.blob.as_ref().unwrap();
}
static_mem
};

Graph {
ctx: self.clone_ctx(),
graph: graph.instantiate(self),
topology: src.topology.clone(),
edges,
static_mem,
stack: self.malloc(unidir.peak()),
stack: self.malloc(stack.peak()),
}
}
}

/// Returns the device-memory layout for an edge's tensor blob, with its
/// alignment raised to 256 bytes.
/// NOTE(review): 256 is presumably the CUDA device allocation granularity —
/// confirm against the allocator this feeds.
#[inline(always)]
fn cuda_layout(edge: &(Tensor, String)) -> Layout {
    // Only the tensor half of the (tensor, name) pair matters for sizing.
    let (tensor, _name) = edge;
    tensor.blob_mem_layout().align_to(256).unwrap()
}

/// A memory offset packed into one `usize`.
///
/// The most significant bit tags the offset as pointing into the static
/// storage region; the remaining bits hold the offset value itself.
/// `usize::MAX` (all bits set) serves as the "not yet assigned" sentinel.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(transparent)]
struct MemOffset(usize);

impl MemOffset {
    /// Sentinel for an edge whose storage has not been assigned.
    /// Note: the tag bit is set, so `INVALID.is_static()` is `true`.
    const INVALID: Self = Self(usize::MAX);
    /// The tag bit: set ⇒ offset lies in the static memory region.
    const BIT: usize = 1 << (usize::BITS - 1);

    /// Packs a static-region offset by setting the tag bit.
    fn from_static(offset: usize) -> Self {
        MemOffset(Self::BIT | offset)
    }

    /// Whether this offset points into the static memory region.
    fn is_static(self) -> bool {
        (self.0 & Self::BIT) == Self::BIT
    }

    /// The raw offset with the tag bit stripped.
    fn offset(self) -> usize {
        self.0 & (Self::BIT - 1)
    }
}

0 comments on commit f88d16c

Please sign in to comment.