Skip to content

Commit

Permalink
feat(runtimes/nvidia): 实现构图的外围逻辑
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <[email protected]>
  • Loading branch information
YdrMaster committed Jan 2, 2024
1 parent a8eb0f0 commit f88d16c
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 22 deletions.
2 changes: 1 addition & 1 deletion runtimes/nvidia/src/driver/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use std::{sync::OnceLock, vec::Vec};
pub(crate) fn devices() -> &'static Vec<Device> {
static MANAGER: OnceLock<Vec<Device>> = OnceLock::new();
MANAGER.get_or_init(|| {
cuda::invoke!(cuInit(0));
cuda::init();

let mut device_count = 0i32;
cuda::invoke!(cuDeviceGetCount(&mut device_count));
Expand Down
2 changes: 1 addition & 1 deletion runtimes/nvidia/src/driver/graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ fn test_memcpy3d() {
/// 测试 cuda graph 与 context 的交互行为。
#[test]
fn test_graph_exec() {
cuda::invoke!(cuInit(0));
cuda::init();

// 创建 cuda graph 不需要 context。
let mut graph: cuda::CUgraph = null_mut();
Expand Down
11 changes: 6 additions & 5 deletions runtimes/nvidia/src/driver/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
}};
}

/// Initializes the CUDA driver API.
///
/// Wraps `cuInit(0)` — the flag argument is required to be 0 by the CUDA
/// driver API. Must run before any other driver call; callers in this crate
/// (e.g. `devices()` and the graph tests) invoke it first for that reason.
/// NOTE(review): `invoke!` presumably checks the returned `CUresult` and
/// panics on failure — confirm against the macro definition above.
#[inline(always)]
pub(crate) fn init() {
    invoke!(cuInit(0));
}

pub(super) use invoke;
}

Expand All @@ -21,11 +26,6 @@ mod graph;
mod memory;
mod stream;

#[inline(always)]
pub(crate) fn init() {
bindings::invoke!(cuInit(0));
}

trait AsRaw<T> {
unsafe fn as_raw(&self) -> T;
}
Expand All @@ -34,6 +34,7 @@ trait WithCtx {
unsafe fn ctx(&self) -> bindings::CUcontext;
}

pub(crate) use bindings::init;
pub(crate) use context::{Context, ContextGuard};
pub(crate) use device::devices;
pub(crate) use graph::{ExecutableGraph, Graph};
Expand Down
81 changes: 66 additions & 15 deletions runtimes/nvidia/src/graph.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use crate::driver::{self, ContextGuard};
use computation::Tensor;
use graph_topo::GraphTopo;
use stack_calculator::{flat, unidir, RealtimeCalculator};
use std::sync::Arc;
use std::{alloc::Layout, collections::BTreeSet, sync::Arc};

pub struct Graph {
ctx: Arc<driver::Context>,
Expand All @@ -12,14 +13,6 @@ pub struct Graph {
stack: driver::Blob,
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(transparent)]
struct MemOffset(usize);

impl MemOffset {
const INVALID: MemOffset = MemOffset(usize::MAX);
}

impl Graph {
#[inline]
pub fn new(src: &computation::Graph, dev: usize) -> Self {
Expand All @@ -41,23 +34,81 @@ impl ContextGuard<'_> {
pub fn runtime_graph(&self, src: &computation::Graph) -> Graph {
let src = &src.0;

let mut flat = flat::RealtimeCalculator::default();
let mut unidir = unidir::RealtimeCalculator::default();
let mut static_mem = flat::RealtimeCalculator::default();
let mut stack = unidir::RealtimeCalculator::default();

let mut edges = vec![MemOffset::INVALID; src.edges.len()];
let mut local_edges = BTreeSet::<usize>::new();

driver::init();
let graph = driver::Graph::new();
#[allow(non_camel_case_types)]
type urc = u16;
const STATIC: urc = urc::MAX;
let mut edge_rc = vec![0 as urc; src.edges.len()];
for edge_idx in src.topology.connections() {
edge_rc[edge_idx] += 1;
}

src.topology
.global_inputs()
.chain(src.topology.global_outputs())
.for_each(|edge_idx| {
edge_rc[edge_idx] = STATIC;
edges[edge_idx] = MemOffset::from_static(
// 全图输入输出分配在静态存储区
static_mem.alloc(cuda_layout(&src.edges[edge_idx])).start,
);
});

let mut static_mem = self.malloc(flat.peak());
let mut graph = driver::Graph::new();

for (node_idx, inputs, outputs) in &src.topology {
let (op, _) = &src.nodes[node_idx];
// TODO 分配栈空间,构造计算节点
}

let static_mem = {
// TODO 把分配和拷贝调度到流上异步执行
let stream = self.stream();
let mut static_mem = self.malloc(static_mem.peak());
for edge_idx in local_edges {
let blob = src.edges[edge_idx].0.blob.as_ref().unwrap();
}
static_mem
};

Graph {
ctx: self.clone_ctx(),
graph: graph.instantiate(self),
topology: src.topology.clone(),
edges,
static_mem,
stack: self.malloc(unidir.peak()),
stack: self.malloc(stack.peak()),
}
}
}

/// Returns the device-memory layout for an edge's tensor blob, with its
/// alignment raised to 256 bytes.
/// NOTE(review): 256 is presumably the CUDA device allocation granularity —
/// confirm against the allocator this feeds.
#[inline(always)]
fn cuda_layout(edge: &(Tensor, String)) -> Layout {
    // Only the tensor half of the (tensor, name) pair matters for sizing.
    let (tensor, _name) = edge;
    tensor.blob_mem_layout().align_to(256).unwrap()
}

/// A memory offset packed into one `usize`.
///
/// The most significant bit tags the offset as pointing into the static
/// storage region; the remaining bits hold the offset value itself.
/// `usize::MAX` (all bits set) serves as the "not yet assigned" sentinel.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(transparent)]
struct MemOffset(usize);

impl MemOffset {
    /// Sentinel for an edge whose storage has not been assigned.
    /// Note: the tag bit is set, so `INVALID.is_static()` is `true`.
    const INVALID: Self = Self(usize::MAX);
    /// The tag bit: set ⇒ offset lies in the static memory region.
    const BIT: usize = 1 << (usize::BITS - 1);

    /// Packs a static-region offset by setting the tag bit.
    fn from_static(offset: usize) -> Self {
        MemOffset(Self::BIT | offset)
    }

    /// Whether this offset points into the static memory region.
    fn is_static(self) -> bool {
        (self.0 & Self::BIT) == Self::BIT
    }

    /// The raw offset with the tag bit stripped.
    fn offset(self) -> usize {
        self.0 & (Self::BIT - 1)
    }
}

0 comments on commit f88d16c

Please sign in to comment.