This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Logging + mpt tests
LLukas22 committed Sep 30, 2023
1 parent 78b0e25 commit 8ad589b
Showing 4 changed files with 25 additions and 11 deletions.
binaries/llm-test/configs/mpt.json (2 changes: 1 addition & 1 deletion)

@@ -6,7 +6,7 @@
   {
     "Inference": {
       "input": "When a llama rides a crab, ",
-      "output": "When a llama rides a crab,  the llama is called the \"crab rider\".\nThe crabs are very popular in South America, especially Brazil. They have been used as transportation for many years and they can carry up to five people at once!",
+      "output": "When a llama rides a crab,  the llama is called the \"crab rider\"\nThe Llamas are an animal that can be found in The Maze. They have no special abilities, but they do drop Llamaskin and occasionally some other items when killed by players or monsters alike (see below). It's unknown if there was ever any sort of breeding system for these animals as it seems to only exist on this one world so far; however their existence has been confirmed through player reports from multiple worlds where people claim having seen them before being able see anything else about what happened after seeing just 1-2 at most per game session which makes me believe",
       "maximum_token_count": 128
     }
   },
crates/ggml/src/lib.rs (3 changes: 3 additions & 0 deletions)

@@ -134,6 +134,9 @@ pub const DEFAULT_EPS: f32 = 0.000005;
 /// Maximum number of nodes in a `ggml` graph.
 pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize;
 
+/// Alignment used for the Tensors in a `ggml` graph.
+pub const TENSOR_ALIGNMENT: usize = 32;
+
 /// Value overrides to use for RoPE.
 ///
 /// Formula: `theta_i = scale * base^(-2(i-1)/d), for i in [1, 2, ..., d/2]`
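The new `TENSOR_ALIGNMENT` constant names the magic `32` that the graph allocator below previously hard-coded. For context, an alignment constant like this is typically used to round sizes up to an aligned boundary; here is a minimal, self-contained sketch of that arithmetic (the `align_up` helper is illustrative and not part of this commit):

```rust
/// Round `size` up to the next multiple of `alignment`.
/// Illustrative helper; not part of this commit.
fn align_up(size: usize, alignment: usize) -> usize {
    (size + alignment - 1) / alignment * alignment
}

fn main() {
    const TENSOR_ALIGNMENT: usize = 32; // mirrors the new ggml constant

    // A 1000-byte tensor gets padded to the next 32-byte boundary...
    assert_eq!(align_up(1000, TENSOR_ALIGNMENT), 1024);
    // ...while an already-aligned size is left unchanged.
    assert_eq!(align_up(1024, TENSOR_ALIGNMENT), 1024);
}
```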
crates/ggml/src/tensor.rs (2 changes: 1 addition & 1 deletion)

@@ -88,7 +88,7 @@ impl Tensor {
         self.with_alive_ctx(|| {
             #[cfg(feature = "cublas")]
             unsafe {
-                sys::cuda::ggml_cuda_assign_buffers(self.ptr.as_ptr());
+                sys::cuda::ggml_cuda_assign_buffers_no_alloc(self.ptr.as_ptr());
             }
         })
     }
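Switching to the `_no_alloc` variant defers device allocation: the tensor is only tagged for GPU offload here, and memory is reserved later in one block once the whole graph has been measured, which is what the `inference_session.rs` changes below set up (note the scratch size is also dropped to 0 there). A toy model of this assign-now, allocate-later pattern; this is not the ggml CUDA API, which operates on tensors rather than raw sizes:

```rust
/// Toy stand-in for deferred buffer assignment (illustration only).
struct DeferredPool {
    pending: Vec<usize>, // byte sizes of tensors tagged for offload
}

impl DeferredPool {
    fn new() -> Self {
        Self { pending: Vec::new() }
    }

    /// Tag a tensor for offload without allocating anything yet,
    /// analogous in spirit to `ggml_cuda_assign_buffers_no_alloc`.
    fn assign_no_alloc(&mut self, size: usize) {
        self.pending.push(size);
    }

    /// Later, once everything is tagged, allocate a single buffer
    /// large enough for all of it.
    fn allocate_all(&self) -> Vec<u8> {
        vec![0u8; self.pending.iter().sum()]
    }
}

fn main() {
    let mut pool = DeferredPool::new();
    pool.assign_no_alloc(1024);
    pool.assign_no_alloc(4096);
    assert_eq!(pool.allocate_all().len(), 5120);
}
```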
crates/llm-base/src/inference_session.rs (29 changes: 20 additions & 9 deletions)

@@ -139,9 +139,14 @@ impl InferenceSession {
             size
         };
 
+        log::info!(
+            "Allocating {:.2} MB for KV-memory",
+            context_byte_size / (1024 * 1024)
+        );
+
         if use_gpu {
             ggml::accelerator::initialize(0);
-            ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024);
+            ggml::accelerator::set_scratch_size(0);
         }
 
         // TODO: revisit this with `Rc`, maybe? We should be able to prove that the session
@@ -156,12 +161,16 @@
         let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);
 
         // Allocate buffer for storing tensor and graph structs
-        // Should be 1540816
         let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES);
         let eval = Buffer::new(buf_size);
+        log::info!(
+            "Allocating {:.2} MB for eval-context",
+            buf_size / (1024 * 1024)
+        );
 
         let ctx0 = ggml::Context::new_with_buffer(eval, false);
 
-        let allocator = GraphAllocator::new_measurement(32);
+        let allocator = GraphAllocator::new_measurement(ggml::TENSOR_ALIGNMENT);
         // Set up Metal support
         #[cfg(feature = "metal")]
         let metal_context = {
@@ -217,8 +226,6 @@
         let ctx0 = &mut self.ctx0;
 
         // If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens.
-        let tensor_alignment = 32;
-
         let max_n_tokens = self.config.n_batch.min(self.context_size);
         // We assume the history is full
         let max_n_past = self.context_size - max_n_tokens;
@@ -238,12 +245,16 @@
             };
 
             let (mut worst_case_graph, built_result) = builder(bc);
+            // Expand the graph
             worst_case_graph.build_forward_expand(&built_result.result);
-            // Should be 73924640
-            let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment;
-            let buffer = Buffer::new(graph_size);
-
-            self.allocator.switch_buffer(buffer, tensor_alignment);
+            // Allocate the graph
+            let graph_size =
+                self.allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
+            log::info!("Allocating {:.2} MB for graph", graph_size / (1024 * 1024));
+            // Pre-allocate the buffer for future use
+            let buffer = Buffer::new(graph_size);
+            self.allocator.switch_buffer(buffer, ggml::TENSOR_ALIGNMENT);
         }
 
         self.ctx0.recreate();
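Taken together, the session now follows a measure-then-allocate scheme: a measurement allocator walks a worst-case graph (a full history plus `n_batch` new tokens) to find the peak byte count, and only then is a single buffer of that size, padded by `TENSOR_ALIGNMENT`, created and handed over via `switch_buffer` for reuse on every evaluation. A self-contained toy of the two-pass idea; the real `GraphAllocator` is considerably more involved:

```rust
const TENSOR_ALIGNMENT: usize = 32; // mirrors ggml::TENSOR_ALIGNMENT

/// Toy measurement pass: tally aligned tensor sizes without allocating.
fn measure_graph(tensor_sizes: &[usize]) -> usize {
    tensor_sizes
        .iter()
        .map(|s| (s + TENSOR_ALIGNMENT - 1) / TENSOR_ALIGNMENT * TENSOR_ALIGNMENT)
        .sum()
}

fn main() {
    // Pass 1: "allocate" the worst-case graph on paper only.
    let worst_case = [4096usize, 100, 8192];
    let graph_size = measure_graph(&worst_case) + TENSOR_ALIGNMENT;

    // Pass 2: allocate one real buffer of exactly that size and reuse it
    // for every subsequent evaluation (the diff's `switch_buffer` step).
    let buffer = vec![0u8; graph_size];
    println!("graph buffer: {} bytes", buffer.len());
}
```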
