diff --git a/runtimes/nvidia/src/driver/device.rs b/runtimes/nvidia/src/driver/device.rs index be8a933..bc3080f 100644 --- a/runtimes/nvidia/src/driver/device.rs +++ b/runtimes/nvidia/src/driver/device.rs @@ -4,7 +4,7 @@ use std::{sync::OnceLock, vec::Vec}; pub(crate) fn devices() -> &'static Vec { static MANAGER: OnceLock> = OnceLock::new(); MANAGER.get_or_init(|| { - cuda::invoke!(cuInit(0)); + cuda::init(); let mut device_count = 0i32; cuda::invoke!(cuDeviceGetCount(&mut device_count)); diff --git a/runtimes/nvidia/src/driver/graph.rs b/runtimes/nvidia/src/driver/graph.rs index 454048d..0c18bbc 100644 --- a/runtimes/nvidia/src/driver/graph.rs +++ b/runtimes/nvidia/src/driver/graph.rs @@ -238,7 +238,7 @@ fn test_memcpy3d() { /// 测试 cuda graph 与 context 的交互行为。 #[test] fn test_graph_exec() { - cuda::invoke!(cuInit(0)); + cuda::init(); // 创建 cuda graph 不需要 context。 let mut graph: cuda::CUgraph = null_mut(); diff --git a/runtimes/nvidia/src/driver/mod.rs b/runtimes/nvidia/src/driver/mod.rs index d08daeb..e8a4a85 100644 --- a/runtimes/nvidia/src/driver/mod.rs +++ b/runtimes/nvidia/src/driver/mod.rs @@ -12,6 +12,11 @@ }}; } + #[inline(always)] + pub(crate) fn init() { + invoke!(cuInit(0)); + } + pub(super) use invoke; } @@ -21,11 +26,6 @@ mod graph; mod memory; mod stream; -#[inline(always)] -pub(crate) fn init() { - bindings::invoke!(cuInit(0)); -} - trait AsRaw { unsafe fn as_raw(&self) -> T; } @@ -34,6 +34,7 @@ trait WithCtx { unsafe fn ctx(&self) -> bindings::CUcontext; } +pub(crate) use bindings::init; pub(crate) use context::{Context, ContextGuard}; pub(crate) use device::devices; pub(crate) use graph::{ExecutableGraph, Graph}; diff --git a/runtimes/nvidia/src/graph.rs b/runtimes/nvidia/src/graph.rs index b99f96e..f265e31 100644 --- a/runtimes/nvidia/src/graph.rs +++ b/runtimes/nvidia/src/graph.rs @@ -1,7 +1,8 @@ use crate::driver::{self, ContextGuard}; +use computation::Tensor; use graph_topo::GraphTopo; use stack_calculator::{flat, unidir, RealtimeCalculator}; -use std::sync::Arc; +use std::{alloc::Layout, collections::BTreeSet, sync::Arc}; pub struct Graph { ctx: Arc, @@ -12,14 +13,6 @@ pub struct Graph { stack: driver::Blob, } -#[derive(Clone, Copy, PartialEq, Eq, Debug)] -#[repr(transparent)] -struct MemOffset(usize); - -impl MemOffset { - const INVALID: MemOffset = MemOffset(usize::MAX); -} - impl Graph { #[inline] pub fn new(src: &computation::Graph, dev: usize) -> Self { @@ -41,15 +34,47 @@ impl ContextGuard<'_> { pub fn runtime_graph(&self, src: &computation::Graph) -> Graph { let src = &src.0; - let mut flat = flat::RealtimeCalculator::default(); - let mut unidir = unidir::RealtimeCalculator::default(); + let mut static_mem = flat::RealtimeCalculator::default(); + let mut stack = unidir::RealtimeCalculator::default(); let mut edges = vec![MemOffset::INVALID; src.edges.len()]; + let mut local_edges = BTreeSet::::new(); - driver::init(); - let graph = driver::Graph::new(); + #[allow(non_camel_case_types)] + type urc = u16; + const STATIC: urc = urc::MAX; + let mut edge_rc = vec![0 as urc; src.edges.len()]; + for edge_idx in src.topology.connections() { + edge_rc[edge_idx] += 1; + } + + src.topology + .global_inputs() + .chain(src.topology.global_outputs()) + .for_each(|edge_idx| { + edge_rc[edge_idx] = STATIC; + edges[edge_idx] = MemOffset::from_static( + // 全图输入输出分配在静态存储区 + static_mem.alloc(cuda_layout(&src.edges[edge_idx])).start, + ); + }); - let mut static_mem = self.malloc(flat.peak()); + let mut graph = driver::Graph::new(); + + for (node_idx, inputs, outputs) in &src.topology { + let (op, _) = &src.nodes[node_idx]; + // TODO 分配栈空间,构造计算节点 + } + + let static_mem = { + // TODO 把分配和拷贝调度到流上异步执行 + let stream = self.stream(); + let mut static_mem = self.malloc(static_mem.peak()); + for edge_idx in local_edges { + let blob = src.edges[edge_idx].0.blob.as_ref().unwrap(); + } + static_mem + }; Graph { ctx: self.clone_ctx(), @@ -57,7 +82,33 @@ impl ContextGuard<'_> { topology: src.topology.clone(), edges, static_mem, - stack: self.malloc(unidir.peak()), + stack: self.malloc(stack.peak()), } } } + +#[inline(always)] +fn cuda_layout(edge: &(Tensor, String)) -> Layout { + edge.0.blob_mem_layout().align_to(256).unwrap() +} + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[repr(transparent)] +struct MemOffset(usize); + +impl MemOffset { + const INVALID: MemOffset = MemOffset(usize::MAX); + const BIT: usize = 1 << (usize::BITS - 1); + + fn from_static(offset: usize) -> Self { + Self(offset | Self::BIT) + } + + fn is_static(self) -> bool { + self.0 & Self::BIT != 0 + } + + fn offset(self) -> usize { + self.0 & !Self::BIT + } +}