
Working graph allocator for llama
LLukas22 committed Sep 24, 2023
1 parent 995dd79 commit 6ba5126
Showing 6 changed files with 62 additions and 56 deletions.
31 changes: 24 additions & 7 deletions crates/ggml/src/context.rs
@@ -73,7 +73,12 @@ impl ContextInner {
/// Controls how the context uses memory.
pub enum ContextStorage {
/// Use the provided buffer as memory.
Buffer(Buffer),
Buffer {
/// The buffer to use as memory.
buffer: Buffer,
/// Whether to allocate tensors into this buffer.
allocate: bool,
},
/// Use the provided memory mapped file as memory.
Mmap(Mmap),
/// Allocate `mem_size` bytes of memory.
@@ -94,7 +99,10 @@ impl ContextStorage {
/// Returns the `Buffer` if this is a `Buffer` variant.
pub fn as_buffer(&self) -> Option<&Buffer> {
match self {
Self::Buffer(v) => Some(v),
Self::Buffer {
buffer: v,
allocate: _,
} => Some(v),
_ => None,
}
}
@@ -115,7 +123,16 @@ impl PartialEq for ContextStorage {
fn eq(&self, other: &Self) -> bool {
use ContextStorage::*;
match (self, other) {
(Buffer(l0), Buffer(r0)) => l0 == r0,
(
Buffer {
buffer: l0,
allocate: l1,
},
Buffer {
buffer: r0,
allocate: r1,
},
) => l0 == r0 && l1 == r1,
(Mmap(l0), Mmap(r0)) => l0.as_ptr() == r0.as_ptr(),
(Allocate { mem_size: l }, Allocate { mem_size: r }) => l == r,
_ => false,
@@ -130,10 +147,10 @@ impl Context {
/// Creates a new [Context] with the given storage.
pub fn new(storage: ContextStorage) -> Self {
let init_params = match &storage {
ContextStorage::Buffer(buffer) => sys::ggml_init_params {
ContextStorage::Buffer { buffer, allocate } => sys::ggml_init_params {
mem_size: buffer.size(),
mem_buffer: buffer.data,
no_alloc: false,
no_alloc: !allocate,
},
ContextStorage::Mmap(mmap) => sys::ggml_init_params {
mem_size: mmap.len(),
@@ -160,8 +177,8 @@ impl Context {

/// Creates a new [Context] with the specified buffer.
/// The buffer will be used by GGML.
pub fn new_with_buffer(buffer: Buffer) -> Self {
Self::new(ContextStorage::Buffer(buffer))
pub fn new_with_buffer(buffer: Buffer, allocate: bool) -> Self {
Self::new(ContextStorage::Buffer { buffer, allocate })
}

/// Creates a new [Context] with the specified memory mapped file.
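How the new `allocate` flag maps onto ggml's `no_alloc`, as a self-contained sketch: `Buffer` below is a reduced stand-in rather than the crate's real type, but the flag translation mirrors the `Context::new` change in this hunk.

```rust
/// Minimal stand-in for `ggml::Buffer`, only here to keep the sketch self-contained.
#[allow(dead_code)]
struct Buffer {
    data: Vec<u8>,
}

/// Reduced form of the `ContextStorage` change in this diff: the `Buffer`
/// variant now carries an `allocate` flag instead of being a tuple variant.
enum ContextStorage {
    Buffer { buffer: Buffer, allocate: bool },
}

/// Mirrors how `Context::new` translates the flag: `allocate == false` becomes
/// ggml's `no_alloc == true`, i.e. the context only holds tensor metadata and
/// leaves data placement to a graph allocator.
fn no_alloc_flag(storage: &ContextStorage) -> bool {
    match storage {
        ContextStorage::Buffer { allocate, .. } => !allocate,
    }
}

fn main() {
    let scratch = ContextStorage::Buffer {
        buffer: Buffer { data: vec![0u8; 1024] },
        allocate: false,
    };
    assert!(no_alloc_flag(&scratch)); // metadata-only context
}
```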
32 changes: 13 additions & 19 deletions crates/ggml/src/lib.rs
@@ -131,6 +131,9 @@ pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize;
/// Default epsilon to use for RMS computation.
pub const DEFAULT_EPS: f32 = 0.000005;

/// Maximum number of nodes in a `ggml` graph.
pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize;

/// Value overrides to use for RoPE.
///
/// Formula: `theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]`
@@ -348,26 +351,12 @@ impl GraphExecutionPlan {
}
}

/// Creates a [Type::I8] work buffer with size `plan.work_size` for this [GraphExecutionPlan] in the given [Context].
fn create_work_buffer(&mut self, context: &Context) -> Tensor {
context.new_tensor_1d(Type::I8, self.inner.work_size)
}

/// Assign a work buffer to this [GraphExecutionPlan].
fn assign_work_buffer(&mut self, buffer: &mut Tensor) {
assert!(
buffer.get_type() == Type::I8,
"Work buffer must be of type i8"
);
unsafe {
self.inner.work_data = buffer.data().cast();
}
}

/// Execute this [GraphExecutionPlan] in the given [Context].
pub fn execute(&mut self, context: &Context) {
let mut work_buffer = self.create_work_buffer(context);
self.assign_work_buffer(&mut work_buffer);
pub fn execute(&mut self, buffer: &mut Vec<u8>) {
if self.inner.work_size > 0 {
buffer.resize(self.inner.work_size, 0);
self.inner.work_data = buffer.as_mut_ptr().cast();
}

unsafe {
sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
@@ -556,3 +545,8 @@ pub fn cpu_has_gpublas() -> bool {
pub fn graph_overhead() -> usize {
unsafe { sys::ggml_graph_overhead() }
}

/// Returns the tensor overhead in bytes.
pub fn tensor_overhead() -> usize {
unsafe { sys::ggml_tensor_overhead() }
}
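The new `execute(&mut Vec<u8>)` signature lets the caller own the scratch space and reuse it across evaluations, instead of creating a fresh `Type::I8` tensor in the context on every call. A minimal sketch of that reuse pattern, with `work_size` standing in for the plan's computed work size from the diff:

```rust
/// Stand-in for the part of `GraphExecutionPlan::execute` that manages the
/// caller-provided work buffer; `work_size` plays the role of
/// `self.inner.work_size` in the diff.
fn ensure_work_buffer(buffer: &mut Vec<u8>, work_size: usize) -> *mut u8 {
    if work_size > 0 {
        // `resize` reallocates only when `work_size` exceeds the current
        // capacity, so repeated calls reuse the existing allocation.
        buffer.resize(work_size, 0);
    }
    buffer.as_mut_ptr()
}

fn main() {
    // One buffer owned by the session, reused across "evaluations".
    let mut work_buffer: Vec<u8> = vec![0];
    for work_size in [4096, 1024, 8192] {
        let ptr = ensure_work_buffer(&mut work_buffer, work_size);
        assert!(!ptr.is_null());
        assert!(work_buffer.len() >= work_size);
    }
}
```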
48 changes: 22 additions & 26 deletions crates/llm-base/src/inference_session.rs
@@ -83,6 +83,9 @@ pub struct InferenceSession {

/// Context size of this session.
context_size: usize,

/// Work buffer for graph planning.
work_buffer: Vec<u8>,
}

pub struct BuildContext<'session> {
@@ -146,24 +149,11 @@ impl InferenceSession {
let n_elements = n_embd * n_mem;
let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);

// Allocate buffer for storing intermediate values during evaluation (ctx0 backing)
// For the first run, we need to guess a maximum buffer size so we can measure
// the actual memory consumption of the temporary ggml context.
//
// These numbers are from `llama.cpp`, and could potentially be more efficient.
let buf_size = {
let buf_size_mb = if n_layer >= 80 {
1536
} else if n_layer >= 60 {
1280
} else {
1024
};
buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
};

// Allocate buffer for storing tensor and graph structs
// Should be 1540816
let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES);
let eval = Buffer::new(buf_size);
let ctx0 = ggml::Context::new_with_buffer(eval);
let ctx0 = ggml::Context::new_with_buffer(eval, false);

let allocator = GraphAllocator::new_measurement(32);
// Set up Metal support
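Why `buf_size` above can be so much smaller than the old 1024–1536 MiB guess: with `allocate = false` the context never stores tensor data, only per-node headers plus one graph struct, so it is sized as `graph_overhead() + tensor_overhead() * MAX_NODES` (on the order of the ~1.5 MB noted in the comment). A sketch of that arithmetic with made-up overhead values:

```rust
/// Made-up overheads for illustration; the real numbers come from
/// `ggml::graph_overhead()` and `ggml::tensor_overhead()` at runtime.
const GRAPH_OVERHEAD: usize = 160 * 1024;
const TENSOR_OVERHEAD: usize = 336;
/// Assumed value of `sys::GGML_MAX_NODES`.
const MAX_NODES: usize = 4096;

/// Size of a metadata-only (`no_alloc`) evaluation context: one tensor header
/// per possible graph node, plus the graph bookkeeping itself.
fn metadata_context_size() -> usize {
    GRAPH_OVERHEAD + TENSOR_OVERHEAD * MAX_NODES
}

fn main() {
    // On the order of a megabyte or two, versus the removed 1024–1536 MiB heuristic.
    println!("ctx0 backing size: {} bytes", metadata_context_size());
}
```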
@@ -200,6 +190,7 @@ impl InferenceSession {
n_embd,
allocator,
context_size,
work_buffer: vec![0],
}
}

@@ -213,12 +204,12 @@ impl InferenceSession {
where
F: Fn(BuildContext) -> (ComputationGraph, GraphOutputs),
{
// Build a graph
self.ctx0.recreate();
let ctx0 = &mut self.ctx0;

// Check if we need to allocate the graph
if self.allocator.in_measuring_mode() {
// Build a graph
self.ctx0.recreate();
let ctx0 = &mut self.ctx0;

// If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens.
let tensor_alignment = 32;

@@ -240,13 +231,18 @@
n_past: max_n_past,
};

let (worst_case_graph, _) = builder(bc);
let (mut worst_case_graph, built_result) = builder(bc);
worst_case_graph.build_forward_expand(&built_result.result);
// Should be 73924640 bytes
let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment;
let buffer = Buffer::new(graph_size);

self.allocator.switch_buffer(buffer, tensor_alignment);
}

self.ctx0.recreate();
let ctx0 = &mut self.ctx0;

let mut embd = ctx0
.new_tensor_1d(ggml::Type::I32, input_tokens.len())
.set_name("embd");
@@ -266,6 +262,9 @@

let (mut built_gf, built_result) = builder(bc);

// Build the graph
built_gf.build_forward_expand(&built_result.result);

// Allocate the graph
self.allocator.allocate_graph(&built_gf);

@@ -280,9 +279,6 @@
// Write input tokens
unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) };

// Compute the graph
built_gf.build_forward_expand(&built_result.result);

#[cfg(feature = "metal")]
{
// FIXME can only process one token at a time currently
@@ -303,7 +299,7 @@
#[cfg(not(feature = "metal"))]
{
let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
plan.execute(ctx0);
plan.execute(&mut self.work_buffer);
}

// Adjust the required memory per token if we didn't know that already
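For reference, the overall shape of the new evaluation path above — measure once against a worst-case graph, switch to a sized buffer, then allocate each real graph before executing it — as a self-contained sketch. The types below are stand-ins reduced to the control flow; the real `GraphAllocator`, graph types and builder live in the crates shown in this diff.

```rust
/// Stand-ins reduced to control flow; only the sequencing mirrors the commit,
/// the types and sizes are made up for the sketch.
#[allow(dead_code)]
struct Buffer {
    size: usize,
}

struct Graph {
    nodes: usize,
}

struct GraphAllocator {
    measuring: bool,
    buffer: Option<Buffer>,
}

impl GraphAllocator {
    fn new_measurement() -> Self {
        // The real constructor also takes a tensor alignment (e.g. 32).
        Self { measuring: true, buffer: None }
    }

    fn in_measuring_mode(&self) -> bool {
        self.measuring
    }

    /// Walks the graph and returns the scratch size it would need (placeholder cost).
    fn allocate_graph(&mut self, graph: &Graph) -> usize {
        graph.nodes * 512
    }

    /// Equivalent of `switch_buffer`: leave measuring mode and adopt a real buffer.
    fn switch_buffer(&mut self, buffer: Buffer, _tensor_alignment: usize) {
        self.buffer = Some(buffer);
        self.measuring = false;
    }
}

/// High-level shape of the new evaluation path in `InferenceSession`.
fn eval(
    allocator: &mut GraphAllocator,
    build_graph: &dyn Fn(usize) -> Graph,
    n_tokens: usize,
    context_size: usize,
) {
    let tensor_alignment = 32;
    if allocator.in_measuring_mode() {
        // First call: size the allocator against a worst-case graph, i.e. as
        // many tokens as the context can ever hold.
        let worst_case = build_graph(context_size);
        let graph_size = allocator.allocate_graph(&worst_case) + tensor_alignment;
        allocator.switch_buffer(Buffer { size: graph_size }, tensor_alignment);
    }
    // Every call: build the real graph and allocate it into the buffer,
    // then execute it with the session's shared work buffer.
    let graph = build_graph(n_tokens);
    allocator.allocate_graph(&graph);
}

fn main() {
    let mut allocator = GraphAllocator::new_measurement();
    let build = |tokens: usize| Graph { nodes: 4 + tokens };
    eval(&mut allocator, &build, 8, 2048); // measures, switches buffer, then runs
    eval(&mut allocator, &build, 8, 2048); // measurement is skipped from now on
    assert!(allocator.buffer.is_some());
}
```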
3 changes: 2 additions & 1 deletion crates/llm-base/src/lora.rs
@@ -128,8 +128,9 @@ impl LoraAdapter {
gf.build_forward_expand(&output);

//TODO: maybe pass the model's thread count to this context
let mut work_buffer = vec![0u8];
let mut plan = GraphExecutionPlan::new(&mut gf, 8);
plan.execute(&patch_context);
plan.execute(&mut work_buffer);

// Overwrite the original tensor.
// The `output` and the `target_tensor` are not from the same context,
2 changes: 0 additions & 2 deletions crates/models/llama/src/lib.rs
@@ -166,12 +166,10 @@ impl KnownModel for Llama {
let input_len = builder.embd.nelements();

let mut ctx0 = builder.ctx0.borrow_mut();
let allocator = builder.allocator.borrow();

let embd = builder.embd;

let mut input_layer = ctx0.op_get_rows(&self.wte, embd);
allocator.allocate(&input_layer);

let mut gf = ctx0.create_compute_graph();

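The removed `allocator.allocate(&input_layer)` call reflects the new split of responsibilities: model code only records ops into the graph, and the session allocates the whole graph at once after `build_forward_expand`. A sketch of that contract with reduced stand-in types (not the crate's real API):

```rust
/// Illustrative stand-ins only: a "tensor" is just an id and the graph records
/// which nodes were produced, so a single allocator pass can place all of them.
struct Tensor(usize);

#[derive(Default)]
struct ComputeGraph {
    nodes: Vec<usize>,
}

impl ComputeGraph {
    /// Plays the role of `build_forward_expand`: register the result node
    /// (and, in the real crate, everything it depends on).
    fn build_forward_expand(&mut self, result: &Tensor) {
        self.nodes.push(result.0);
    }
}

/// Model side after this commit: only build ops, no per-tensor allocation.
fn model_eval(graph: &mut ComputeGraph) -> Tensor {
    let input_layer = Tensor(0); // e.g. ctx0.op_get_rows(&wte, embd)
    let logits = Tensor(input_layer.0 + 1); // ...rest of the layer stack...
    graph.build_forward_expand(&logits);
    logits
}

fn main() {
    let mut graph = ComputeGraph::default();
    let _logits = model_eval(&mut graph);
    // The session now calls `allocator.allocate_graph(&graph)` once, instead of
    // the model calling `allocator.allocate(&tensor)` per intermediate tensor.
    assert_eq!(graph.nodes.len(), 1);
}
```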
