
Working graph allocator for llama
LLukas22 committed Sep 24, 2023
1 parent 995dd79 commit 6ba5126
Showing 6 changed files with 62 additions and 56 deletions.
31 changes: 24 additions & 7 deletions crates/ggml/src/context.rs
@@ -73,7 +73,12 @@ impl ContextInner {
/// Controls how the context uses memory.
pub enum ContextStorage {
/// Use the provided buffer as memory.
Buffer(Buffer),
Buffer {
/// The buffer to use as memory.
buffer: Buffer,
/// Whether to allocate tensors into this buffer.
allocate: bool,
},
/// Use the provided memory mapped file as memory.
Mmap(Mmap),
/// Allocate `mem_size` bytes of memory.
@@ -94,7 +99,10 @@ impl ContextStorage {
/// Returns the `Buffer` if this is a `Buffer` variant.
pub fn as_buffer(&self) -> Option<&Buffer> {
match self {
Self::Buffer(v) => Some(v),
Self::Buffer {
buffer: v,
allocate: _,
} => Some(v),
_ => None,
}
}
@@ -115,7 +123,16 @@ impl PartialEq for ContextStorage {
fn eq(&self, other: &Self) -> bool {
use ContextStorage::*;
match (self, other) {
(Buffer(l0), Buffer(r0)) => l0 == r0,
(
Buffer {
buffer: l0,
allocate: l1,
},
Buffer {
buffer: r0,
allocate: r1,
},
) => l0 == r0 && l1 == r1,
(Mmap(l0), Mmap(r0)) => l0.as_ptr() == r0.as_ptr(),
(Allocate { mem_size: l }, Allocate { mem_size: r }) => l == r,
_ => false,
@@ -130,10 +147,10 @@ impl Context {
/// Creates a new [Context] with the given storage.
pub fn new(storage: ContextStorage) -> Self {
let init_params = match &storage {
ContextStorage::Buffer(buffer) => sys::ggml_init_params {
ContextStorage::Buffer { buffer, allocate } => sys::ggml_init_params {
mem_size: buffer.size(),
mem_buffer: buffer.data,
no_alloc: false,
no_alloc: !allocate,
},
ContextStorage::Mmap(mmap) => sys::ggml_init_params {
mem_size: mmap.len(),
@@ -160,8 +177,8 @@ impl Context {

/// Creates a new [Context] with the specified buffer.
/// The buffer will be used by GGML.
pub fn new_with_buffer(buffer: Buffer) -> Self {
Self::new(ContextStorage::Buffer(buffer))
pub fn new_with_buffer(buffer: Buffer, allocate: bool) -> Self {
Self::new(ContextStorage::Buffer { buffer, allocate })
}

/// Creates a new [Context] with the specified memory mapped file.
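How the new `allocate` flag maps onto ggml's `no_alloc`, as a self-contained sketch: `Buffer` below is a reduced stand-in rather than the crate's real type, but the flag translation mirrors the `Context::new` change in this hunk.

```rust
/// Minimal stand-in for `ggml::Buffer`, only here to keep the sketch self-contained.
#[allow(dead_code)]
struct Buffer {
    data: Vec<u8>,
}

/// Reduced form of the `ContextStorage` change in this diff: the `Buffer`
/// variant now carries an `allocate` flag instead of being a tuple variant.
enum ContextStorage {
    Buffer { buffer: Buffer, allocate: bool },
}

/// Mirrors how `Context::new` translates the flag: `allocate == false` becomes
/// ggml's `no_alloc == true`, i.e. the context only holds tensor metadata and
/// leaves data placement to a graph allocator.
fn no_alloc_flag(storage: &ContextStorage) -> bool {
    match storage {
        ContextStorage::Buffer { allocate, .. } => !allocate,
    }
}

fn main() {
    let scratch = ContextStorage::Buffer {
        buffer: Buffer { data: vec![0u8; 1024] },
        allocate: false,
    };
    assert!(no_alloc_flag(&scratch)); // metadata-only context
}
```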
32 changes: 13 additions & 19 deletions crates/ggml/src/lib.rs
@@ -131,6 +131,9 @@ pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize;
/// Default epsilon to use for RMS computation.
pub const DEFAULT_EPS: f32 = 0.000005;

/// Maximum number of nodes in a `ggml` graph.
pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize;

/// Value overrides to use for RoPE.
///
/// Formula: `theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]`
@@ -348,26 +351,12 @@ impl GraphExecutionPlan {
}
}

/// Creates a [Type::I8] work buffer with size `plan.work_size` for this [GraphExecutionPlan] in the given [Context].
fn create_work_buffer(&mut self, context: &Context) -> Tensor {
context.new_tensor_1d(Type::I8, self.inner.work_size)
}

/// Assign a work buffer to this [GraphExecutionPlan].
fn assign_work_buffer(&mut self, buffer: &mut Tensor) {
assert!(
buffer.get_type() == Type::I8,
"Work buffer must be of type i8"
);
unsafe {
self.inner.work_data = buffer.data().cast();
}
}

/// Execute this [GraphExecutionPlan] in the given [Context].
pub fn execute(&mut self, context: &Context) {
let mut work_buffer = self.create_work_buffer(context);
self.assign_work_buffer(&mut work_buffer);
pub fn execute(&mut self, buffer: &mut Vec<u8>) {
if self.inner.work_size > 0 {
buffer.resize(self.inner.work_size, 0);
self.inner.work_data = buffer.as_mut_ptr().cast();
}

unsafe {
sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
@@ -556,3 +545,8 @@ pub fn cpu_has_gpublas() -> bool {
pub fn graph_overhead() -> usize {
unsafe { sys::ggml_graph_overhead() }
}

/// Returns the tensor overhead in bytes.
pub fn tensor_overhead() -> usize {
unsafe { sys::ggml_tensor_overhead() }
}
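The new `execute(&mut Vec<u8>)` signature lets the caller own the scratch space and reuse it across evaluations, instead of creating a fresh `Type::I8` tensor in the context on every call. A minimal sketch of that reuse pattern, with `work_size` standing in for the plan's computed work size from the diff:

```rust
/// Stand-in for the part of `GraphExecutionPlan::execute` that manages the
/// caller-provided work buffer; `work_size` plays the role of
/// `self.inner.work_size` in the diff.
fn ensure_work_buffer(buffer: &mut Vec<u8>, work_size: usize) -> *mut u8 {
    if work_size > 0 {
        // `resize` reallocates only when `work_size` exceeds the current
        // capacity, so repeated calls reuse the existing allocation.
        buffer.resize(work_size, 0);
    }
    buffer.as_mut_ptr()
}

fn main() {
    // One buffer owned by the session, reused across "evaluations".
    let mut work_buffer: Vec<u8> = vec![0];
    for work_size in [4096, 1024, 8192] {
        let ptr = ensure_work_buffer(&mut work_buffer, work_size);
        assert!(!ptr.is_null());
        assert!(work_buffer.len() >= work_size);
    }
}
```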
48 changes: 22 additions & 26 deletions crates/llm-base/src/inference_session.rs
@@ -83,6 +83,9 @@ pub struct InferenceSession {

/// Context size of this session.
context_size: usize,

/// Work buffer for graph planning.
work_buffer: Vec<u8>,
}

pub struct BuildContext<'session> {
@@ -146,24 +149,11 @@ impl InferenceSession {
let n_elements = n_embd * n_mem;
let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);

// Allocate buffer for storing intermediate values during evaluation (ctx0 backing)
// For the first run, we need to guess a maximum buffer size so we can measure
// the actual memory consumption of the temporary ggml context.
//
// These numbers are from `llama.cpp`, and could potentially be more efficient.
let buf_size = {
let buf_size_mb = if n_layer >= 80 {
1536
} else if n_layer >= 60 {
1280
} else {
1024
};
buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
};

// Allocate buffer for storing tensor and graph structs
// Should be 1540816
let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES);
let eval = Buffer::new(buf_size);
let ctx0 = ggml::Context::new_with_buffer(eval);
let ctx0 = ggml::Context::new_with_buffer(eval, false);

let allocator = GraphAllocator::new_measurement(32);
// Set up Metal support
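Why `buf_size` above can be so much smaller than the old 1024–1536 MiB guess: with `allocate = false` the context never stores tensor data, only per-node headers plus one graph struct, so it is sized as `graph_overhead() + tensor_overhead() * MAX_NODES` (on the order of the ~1.5 MB noted in the comment). A sketch of that arithmetic with made-up overhead values:

```rust
/// Made-up overheads for illustration; the real numbers come from
/// `ggml::graph_overhead()` and `ggml::tensor_overhead()` at runtime.
const GRAPH_OVERHEAD: usize = 160 * 1024;
const TENSOR_OVERHEAD: usize = 336;
/// Assumed value of `sys::GGML_MAX_NODES`.
const MAX_NODES: usize = 4096;

/// Size of a metadata-only (`no_alloc`) evaluation context: one tensor header
/// per possible graph node, plus the graph bookkeeping itself.
fn metadata_context_size() -> usize {
    GRAPH_OVERHEAD + TENSOR_OVERHEAD * MAX_NODES
}

fn main() {
    // On the order of a megabyte or two, versus the removed 1024–1536 MiB heuristic.
    println!("ctx0 backing size: {} bytes", metadata_context_size());
}
```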
@@ -200,6 +190,7 @@ impl InferenceSession {
n_embd,
allocator,
context_size,
work_buffer: vec![0],
}
}

@@ -213,12 +204,12 @@ impl InferenceSession {
where
F: Fn(BuildContext) -> (ComputationGraph, GraphOutputs),
{
// Build a graph
self.ctx0.recreate();
let ctx0 = &mut self.ctx0;

// Check if we need to allocate the graph
if self.allocator.in_measuring_mode() {
// Build a graph
self.ctx0.recreate();
let ctx0 = &mut self.ctx0;

// If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens.
let tensor_alignment = 32;

@@ -240,13 +231,18 @@
n_past: max_n_past,
};

let (worst_case_graph, _) = builder(bc);
let (mut worst_case_graph, built_result) = builder(bc);
worst_case_graph.build_forward_expand(&built_result.result);
// Should be 73924640 bytes
let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment;
let buffer = Buffer::new(graph_size);

self.allocator.switch_buffer(buffer, tensor_alignment);
}

self.ctx0.recreate();
let ctx0 = &mut self.ctx0;

let mut embd = ctx0
.new_tensor_1d(ggml::Type::I32, input_tokens.len())
.set_name("embd");
@@ -266,6 +262,9 @@

let (mut built_gf, built_result) = builder(bc);

// Build the graph
built_gf.build_forward_expand(&built_result.result);

// Allocate the graph
self.allocator.allocate_graph(&built_gf);

@@ -280,9 +279,6 @@
// Write input tokens
unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) };

// Compute the graph
built_gf.build_forward_expand(&built_result.result);

#[cfg(feature = "metal")]
{
// FIXME can only process one token at a time currently
@@ -303,7 +299,7 @@
#[cfg(not(feature = "metal"))]
{
let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
plan.execute(ctx0);
plan.execute(&mut self.work_buffer);
}

// Adjust the required memory per token if we didn't know that already
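For reference, the overall shape of the new evaluation path above — measure once against a worst-case graph, switch to a sized buffer, then allocate each real graph before executing it — as a self-contained sketch. The types below are stand-ins reduced to the control flow; the real `GraphAllocator`, graph types and builder live in the crates shown in this diff.

```rust
/// Stand-ins reduced to control flow; only the sequencing mirrors the commit,
/// the types and sizes are made up for the sketch.
#[allow(dead_code)]
struct Buffer {
    size: usize,
}

struct Graph {
    nodes: usize,
}

struct GraphAllocator {
    measuring: bool,
    buffer: Option<Buffer>,
}

impl GraphAllocator {
    fn new_measurement() -> Self {
        // The real constructor also takes a tensor alignment (e.g. 32).
        Self { measuring: true, buffer: None }
    }

    fn in_measuring_mode(&self) -> bool {
        self.measuring
    }

    /// Walks the graph and returns the scratch size it would need (placeholder cost).
    fn allocate_graph(&mut self, graph: &Graph) -> usize {
        graph.nodes * 512
    }

    /// Equivalent of `switch_buffer`: leave measuring mode and adopt a real buffer.
    fn switch_buffer(&mut self, buffer: Buffer, _tensor_alignment: usize) {
        self.buffer = Some(buffer);
        self.measuring = false;
    }
}

/// High-level shape of the new evaluation path in `InferenceSession`.
fn eval(
    allocator: &mut GraphAllocator,
    build_graph: &dyn Fn(usize) -> Graph,
    n_tokens: usize,
    context_size: usize,
) {
    let tensor_alignment = 32;
    if allocator.in_measuring_mode() {
        // First call: size the allocator against a worst-case graph, i.e. as
        // many tokens as the context can ever hold.
        let worst_case = build_graph(context_size);
        let graph_size = allocator.allocate_graph(&worst_case) + tensor_alignment;
        allocator.switch_buffer(Buffer { size: graph_size }, tensor_alignment);
    }
    // Every call: build the real graph and allocate it into the buffer,
    // then execute it with the session's shared work buffer.
    let graph = build_graph(n_tokens);
    allocator.allocate_graph(&graph);
}

fn main() {
    let mut allocator = GraphAllocator::new_measurement();
    let build = |tokens: usize| Graph { nodes: 4 + tokens };
    eval(&mut allocator, &build, 8, 2048); // measures, switches buffer, then runs
    eval(&mut allocator, &build, 8, 2048); // measurement is skipped from now on
    assert!(allocator.buffer.is_some());
}
```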
3 changes: 2 additions & 1 deletion crates/llm-base/src/lora.rs
@@ -128,8 +128,9 @@ impl LoraAdapter {
gf.build_forward_expand(&output);

//TODO: maybe pass the model's thread count to this context
let mut work_buffer = vec![0u8];
let mut plan = GraphExecutionPlan::new(&mut gf, 8);
plan.execute(&patch_context);
plan.execute(&mut work_buffer);

// Overwrite the original tensor.
// The `output` and the `target_tensor` are not from the same context,
2 changes: 0 additions & 2 deletions crates/models/llama/src/lib.rs
@@ -166,12 +166,10 @@ impl KnownModel for Llama {
let input_len = builder.embd.nelements();

let mut ctx0 = builder.ctx0.borrow_mut();
let allocator = builder.allocator.borrow();

let embd = builder.embd;

let mut input_layer = ctx0.op_get_rows(&self.wte, embd);
allocator.allocate(&input_layer);

let mut gf = ctx0.create_compute_graph();

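The removed `allocator.allocate(&input_layer)` call reflects the new split of responsibilities: model code only records ops into the graph, and the session allocates the whole graph at once after `build_forward_expand`. A sketch of that contract with reduced stand-in types (not the crate's real API):

```rust
/// Illustrative stand-ins only: a "tensor" is just an id and the graph records
/// which nodes were produced, so a single allocator pass can place all of them.
struct Tensor(usize);

#[derive(Default)]
struct ComputeGraph {
    nodes: Vec<usize>,
}

impl ComputeGraph {
    /// Plays the role of `build_forward_expand`: register the result node
    /// (and, in the real crate, everything it depends on).
    fn build_forward_expand(&mut self, result: &Tensor) {
        self.nodes.push(result.0);
    }
}

/// Model side after this commit: only build ops, no per-tensor allocation.
fn model_eval(graph: &mut ComputeGraph) -> Tensor {
    let input_layer = Tensor(0); // e.g. ctx0.op_get_rows(&wte, embd)
    let logits = Tensor(input_layer.0 + 1); // ...rest of the layer stack...
    graph.build_forward_expand(&logits);
    logits
}

fn main() {
    let mut graph = ComputeGraph::default();
    let _logits = model_eval(&mut graph);
    // The session now calls `allocator.allocate_graph(&graph)` once, instead of
    // the model calling `allocator.allocate(&tensor)` per intermediate tensor.
    assert_eq!(graph.nodes.len(), 1);
}
```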
