diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 6f7a593f..2439e2a7 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -73,7 +73,12 @@ impl ContextInner {
 /// Controls how the context uses memory.
 pub enum ContextStorage {
     /// Use the provided buffer as memory.
-    Buffer(Buffer),
+    Buffer {
+        /// The buffer to use as memory.
+        buffer: Buffer,
+        /// Whether to allocate tensors into this buffer.
+        allocate: bool,
+    },
     /// Use the provided memory mapped file as memory.
     Mmap(Mmap),
     /// Allocate `mem_size` bytes of memory.
@@ -94,7 +99,10 @@ impl ContextStorage {
     /// Returns the `Buffer` if this is a `Buffer` variant.
     pub fn as_buffer(&self) -> Option<&Buffer> {
         match self {
-            Self::Buffer(v) => Some(v),
+            Self::Buffer {
+                buffer: v,
+                allocate: _,
+            } => Some(v),
             _ => None,
         }
     }
@@ -115,7 +123,16 @@ impl PartialEq for ContextStorage {
     fn eq(&self, other: &Self) -> bool {
         use ContextStorage::*;
         match (self, other) {
-            (Buffer(l0), Buffer(r0)) => l0 == r0,
+            (
+                Buffer {
+                    buffer: l0,
+                    allocate: l1,
+                },
+                Buffer {
+                    buffer: r0,
+                    allocate: r1,
+                },
+            ) => l0 == r0 && l1 == r1,
             (Mmap(l0), Mmap(r0)) => l0.as_ptr() == r0.as_ptr(),
             (Allocate { mem_size: l }, Allocate { mem_size: r }) => l == r,
             _ => false,
@@ -130,10 +147,10 @@ impl Context {
     /// Creates a new [Context] with the given storage.
     pub fn new(storage: ContextStorage) -> Self {
         let init_params = match &storage {
-            ContextStorage::Buffer(buffer) => sys::ggml_init_params {
+            ContextStorage::Buffer { buffer, allocate } => sys::ggml_init_params {
                 mem_size: buffer.size(),
                 mem_buffer: buffer.data,
-                no_alloc: false,
+                no_alloc: !allocate,
             },
             ContextStorage::Mmap(mmap) => sys::ggml_init_params {
                 mem_size: mmap.len(),
@@ -160,8 +177,8 @@ impl Context {
 
     /// Creates a new [Context] with the specified buffer.
     /// The buffer will be used by GGML.
-    pub fn new_with_buffer(buffer: Buffer) -> Self {
-        Self::new(ContextStorage::Buffer(buffer))
+    pub fn new_with_buffer(buffer: Buffer, allocate: bool) -> Self {
+        Self::new(ContextStorage::Buffer { buffer, allocate })
     }
 
     /// Creates a new [Context] with the specified memory mapped file.
diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs
index 507b1f60..66ed47f9 100644
--- a/crates/ggml/src/lib.rs
+++ b/crates/ggml/src/lib.rs
@@ -131,6 +131,9 @@ pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize;
 /// Default epsilon to use for RMS computation.
 pub const DEFAULT_EPS: f32 = 0.000005;
 
+/// Maximum number of nodes in a `ggml` graph.
+pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize;
+
 /// Value overrides to use for RoPE.
 ///
 /// Formula: `theta_i = scale * base^(āˆ’2(iāˆ’1)/d), for i in [1, 2, ..., d/2]`
@@ -348,26 +351,12 @@ impl GraphExecutionPlan {
         }
     }
 
-    /// Creates a [Type::I8] work buffer with size `plan.work_size` for this [GraphExecutionPlan] in the given [Context].
-    fn create_work_buffer(&mut self, context: &Context) -> Tensor {
-        context.new_tensor_1d(Type::I8, self.inner.work_size)
-    }
-
-    /// Assign a work buffer to this [GraphExecutionPlan].
-    fn assign_work_buffer(&mut self, buffer: &mut Tensor) {
-        assert!(
-            buffer.get_type() == Type::I8,
-            "Work buffer must be of type i8"
-        );
-        unsafe {
-            self.inner.work_data = buffer.data().cast();
-        }
-    }
-
     /// Execute this [GraphExecutionPlan] in the given [Context].
-    pub fn execute(&mut self, context: &Context) {
-        let mut work_buffer = self.create_work_buffer(context);
-        self.assign_work_buffer(&mut work_buffer);
+    pub fn execute(&mut self, buffer: &mut Vec<u8>) {
+        if self.inner.work_size > 0 {
+            buffer.resize(self.inner.work_size, 0);
+            self.inner.work_data = buffer.as_mut_ptr().cast();
+        }
 
         unsafe {
             sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
@@ -556,3 +545,8 @@ pub fn cpu_has_gpublas() -> bool {
 pub fn graph_overhead() -> usize {
     unsafe { sys::ggml_graph_overhead() }
 }
+
+/// Returns the tensor overhead in bytes.
+pub fn tensor_overhead() -> usize {
+    unsafe { sys::ggml_tensor_overhead() }
+}
diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index b08e75ba..c091cdfb 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit b08e75baea294e366628b898e85c0bd359b58115
+Subproject commit c091cdfb24621710c617ea85c92fcd347d0bf340
diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index 287fe51a..b7495205 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -83,6 +83,9 @@ pub struct InferenceSession {
 
     ///Context size of this session
     context_size: usize,
+
+    /// Work buffer for graph planning
+    work_buffer: Vec<u8>,
 }
 
 pub struct BuildContext<'session> {
@@ -146,24 +149,11 @@ impl InferenceSession {
         let n_elements = n_embd * n_mem;
         let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);
 
-        // Allocate buffer for storing intermediate values during evaluation (ctx0 backing)
-        // For the first run, we need to guess a maximum buffer size so we can measure
-        // the actual memory consumption of the temporary ggml context.
-        //
-        // These numbers are from `llama.cpp`, and could potentially be more efficient.
-        let buf_size = {
-            let buf_size_mb = if n_layer >= 80 {
-                1536
-            } else if n_layer >= 60 {
-                1280
-            } else {
-                1024
-            };
-            buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
-        };
-
+        // Allocate buffer for storing tensor and graph structs
+        // Should be 1540816
+        let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES);
         let eval = Buffer::new(buf_size);
-        let ctx0 = ggml::Context::new_with_buffer(eval);
+        let ctx0 = ggml::Context::new_with_buffer(eval, false);
         let allocator = GraphAllocator::new_measurement(32);
 
         // Set up Metal support
@@ -200,6 +190,7 @@ impl InferenceSession {
             n_embd,
             allocator,
             context_size,
+            work_buffer: vec![0],
         }
     }
 
@@ -213,12 +204,12 @@ impl InferenceSession {
     where
         F: Fn(BuildContext) -> (ComputationGraph, GraphOutputs),
     {
-        // Build a graph
-        self.ctx0.recreate();
-        let ctx0 = &mut self.ctx0;
-
         // Check if we need to allocate the graph
         if self.allocator.in_measuring_mode() {
+            // Build a graph
+            self.ctx0.recreate();
+            let ctx0 = &mut self.ctx0;
+
             // If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens.
             let tensor_alignment = 32;
@@ -240,13 +231,18 @@ impl InferenceSession {
                 n_past: max_n_past,
             };
 
-            let (worst_case_graph, _) = builder(bc);
+            let (mut worst_case_graph, built_result) = builder(bc);
+            worst_case_graph.build_forward_expand(&built_result.result);
 
+            // Should be 73924640
             let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment;
             let buffer = Buffer::new(graph_size);
             self.allocator.switch_buffer(buffer, tensor_alignment);
         }
 
+        self.ctx0.recreate();
+        let ctx0 = &mut self.ctx0;
+
         let mut embd = ctx0
             .new_tensor_1d(ggml::Type::I32, input_tokens.len())
             .set_name("embd");
@@ -266,6 +262,9 @@ impl InferenceSession {
 
         let (mut built_gf, built_result) = builder(bc);
 
+        // Build the graph
+        built_gf.build_forward_expand(&built_result.result);
+
         // Allocate the graph
         self.allocator.allocate_graph(&built_gf);
 
@@ -280,9 +279,6 @@ impl InferenceSession {
         // Write input tokens
         unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) };
 
-        // Compute the graph
-        built_gf.build_forward_expand(&built_result.result);
-
         #[cfg(feature = "metal")]
         {
             // FIXME can only process one token at a time currently
@@ -303,7 +299,7 @@ impl InferenceSession {
         #[cfg(not(feature = "metal"))]
         {
             let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
-            plan.execute(ctx0);
+            plan.execute(&mut self.work_buffer);
         }
 
         // Adjust the required memory per token if we didn't know that already
diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs
index c6d1d8a2..f433931e 100644
--- a/crates/llm-base/src/lora.rs
+++ b/crates/llm-base/src/lora.rs
@@ -128,8 +128,9 @@ impl LoraAdapter {
         gf.build_forward_expand(&output);
 
         //TODO: maybe pass the model's thread count to this context
+        let mut work_buffer = vec![0u8];
         let mut plan = GraphExecutionPlan::new(&mut gf, 8);
-        plan.execute(&patch_context);
+        plan.execute(&mut work_buffer);
 
         // Overwrite the original tensor.
         // The `output` and the `target_tensor` are not from the same context,
diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs
index ea2530ec..61c1d196 100644
--- a/crates/models/llama/src/lib.rs
+++ b/crates/models/llama/src/lib.rs
@@ -166,12 +166,10 @@ impl KnownModel for Llama {
         let input_len = builder.embd.nelements();
 
         let mut ctx0 = builder.ctx0.borrow_mut();
-        let allocator = builder.allocator.borrow();
         let embd = builder.embd;
 
         let mut input_layer = ctx0.op_get_rows(&self.wte, embd);
-        allocator.allocate(&input_layer);
 
         let mut gf = ctx0.create_compute_graph();
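Reviewer note, not part of the patch: a minimal sketch of how the pieces above are meant to fit together, using only APIs that appear in this diff. `builder`, `bc`, `built_gf`, `allocator`, `tensor_alignment`, and `n_threads` stand in for values already in scope inside `InferenceSession`, so treat this as an assumption-laden outline rather than compilable code.

// The scratch context only holds graph/tensor metadata; no tensor data is
// allocated into it because `allocate` is false (`no_alloc = true` in ggml).
let buf_size = ggml::graph_overhead() + ggml::tensor_overhead() * ggml::MAX_NODES;
let ctx0 = ggml::Context::new_with_buffer(Buffer::new(buf_size), false);

// First pass (measuring mode): build a worst-case graph, ask the allocator how
// much tensor memory it needs, then hand it a real buffer of that size.
let (mut worst_case_graph, outputs) = builder(bc);
worst_case_graph.build_forward_expand(&outputs.result);
let graph_size = allocator.allocate_graph(&worst_case_graph) + tensor_alignment;
allocator.switch_buffer(Buffer::new(graph_size), tensor_alignment);

// Execution no longer borrows a ggml `Context` for scratch space: the plan
// resizes a caller-owned `Vec<u8>` work buffer that can be reused across calls.
let mut work_buffer = vec![0u8];
let mut plan = GraphExecutionPlan::new(&mut built_gf, n_threads);
plan.execute(&mut work_buffer);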