This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Logging + mpt tests
LLukas22 committed Sep 30, 2023
1 parent 78b0e25 commit 8ad589b
Showing 4 changed files with 25 additions and 11 deletions.
binaries/llm-test/configs/mpt.json (2 changes: 1 addition & 1 deletion)

@@ -6,7 +6,7 @@
   {
     "Inference": {
       "input": "When a llama rides a crab, ",
-      "output": "When a llama rides a crab,  the llama is called the \"crab rider\".\nThe crabs are very popular in South America, especially Brazil. They have been used as transportation for many years and they can carry up to five people at once!",
+      "output": "When a llama rides a crab,  the llama is called the \"crab rider\"\nThe Llamas are an animal that can be found in The Maze. They have no special abilities, but they do drop Llamaskin and occasionally some other items when killed by players or monsters alike (see below). It's unknown if there was ever any sort of breeding system for these animals as it seems to only exist on this one world so far; however their existence has been confirmed through player reports from multiple worlds where people claim having seen them before being able see anything else about what happened after seeing just 1-2 at most per game session which makes me believe",
       "maximum_token_count": 128
     }
   },
crates/ggml/src/lib.rs (3 changes: 3 additions & 0 deletions)

@@ -134,6 +134,9 @@ pub const DEFAULT_EPS: f32 = 0.000005;
 /// Maximum number of nodes in a `ggml` graph.
 pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize;
 
+/// Alignment used for the Tensors in a `ggml` graph.
+pub const TENSOR_ALIGNMENT: usize = 32;
+
 /// Value overrides to use for RoPE.
 ///
 /// Formula: `theta_i = scale * base^(-2(i-1)/d), for i in [1, 2, ..., d/2]`
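The new `TENSOR_ALIGNMENT` constant names the magic `32` that the graph allocator below previously hard-coded. For context, an alignment constant like this is typically used to round sizes up to an aligned boundary; here is a minimal, self-contained sketch of that arithmetic (the `align_up` helper is illustrative and not part of this commit):

```rust
/// Round `size` up to the next multiple of `alignment`.
/// Illustrative helper; not part of this commit.
fn align_up(size: usize, alignment: usize) -> usize {
    (size + alignment - 1) / alignment * alignment
}

fn main() {
    const TENSOR_ALIGNMENT: usize = 32; // mirrors the new ggml constant

    // A 1000-byte tensor gets padded to the next 32-byte boundary...
    assert_eq!(align_up(1000, TENSOR_ALIGNMENT), 1024);
    // ...while an already-aligned size is left unchanged.
    assert_eq!(align_up(1024, TENSOR_ALIGNMENT), 1024);
}
```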
crates/ggml/src/tensor.rs (2 changes: 1 addition & 1 deletion)

@@ -88,7 +88,7 @@ impl Tensor {
         self.with_alive_ctx(|| {
             #[cfg(feature = "cublas")]
             unsafe {
-                sys::cuda::ggml_cuda_assign_buffers(self.ptr.as_ptr());
+                sys::cuda::ggml_cuda_assign_buffers_no_alloc(self.ptr.as_ptr());
             }
         })
     }
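Switching to the `_no_alloc` variant defers device allocation: the tensor is only tagged for GPU offload here, and memory is reserved later in one block once the whole graph has been measured, which is what the `inference_session.rs` changes below set up (note the scratch size is also dropped to 0 there). A toy model of this assign-now, allocate-later pattern; this is not the ggml CUDA API, which operates on tensors rather than raw sizes:

```rust
/// Toy stand-in for deferred buffer assignment (illustration only).
struct DeferredPool {
    pending: Vec<usize>, // byte sizes of tensors tagged for offload
}

impl DeferredPool {
    fn new() -> Self {
        Self { pending: Vec::new() }
    }

    /// Tag a tensor for offload without allocating anything yet,
    /// analogous in spirit to `ggml_cuda_assign_buffers_no_alloc`.
    fn assign_no_alloc(&mut self, size: usize) {
        self.pending.push(size);
    }

    /// Later, once everything is tagged, allocate a single buffer
    /// large enough for all of it.
    fn allocate_all(&self) -> Vec<u8> {
        vec![0u8; self.pending.iter().sum()]
    }
}

fn main() {
    let mut pool = DeferredPool::new();
    pool.assign_no_alloc(1024);
    pool.assign_no_alloc(4096);
    assert_eq!(pool.allocate_all().len(), 5120);
}
```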
crates/llm-base/src/inference_session.rs (29 changes: 20 additions & 9 deletions)

@@ -139,9 +139,14 @@ impl InferenceSession {
             size
         };
 
+        log::info!(
+            "Allocating {:.2} MB for KV-memory",
+            context_byte_size / (1024 * 1024)
+        );
+
         if use_gpu {
             ggml::accelerator::initialize(0);
-            ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024);
+            ggml::accelerator::set_scratch_size(0);
         }
 
         // TODO: revisit this with `Rc`, maybe? We should be able to prove that the session
@@ -156,12 +161,16 @@
         let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);
 
         // Allocate buffer for storing tensor and graph structs
-        // Should be 1540816
         let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES);
         let eval = Buffer::new(buf_size);
+        log::info!(
+            "Allocating {:.2} MB for eval-context",
+            buf_size / (1024 * 1024)
+        );
 
         let ctx0 = ggml::Context::new_with_buffer(eval, false);
 
-        let allocator = GraphAllocator::new_measurement(32);
+        let allocator = GraphAllocator::new_measurement(ggml::TENSOR_ALIGNMENT);
         // Set up Metal support
         #[cfg(feature = "metal")]
         let metal_context = {
@@ -217,8 +226,6 @@
         let ctx0 = &mut self.ctx0;
 
         // If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens.
-        let tensor_alignment = 32;
-
         let max_n_tokens = self.config.n_batch.min(self.context_size);
         // We assume the history is full
         let max_n_past = self.context_size - max_n_tokens;
@@ -238,12 +245,16 @@
             };
 
             let (mut worst_case_graph, built_result) = builder(bc);
+            // Expand the graph
             worst_case_graph.build_forward_expand(&built_result.result);
-            // Should be 73924640
-            let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment;
-            let buffer = Buffer::new(graph_size);
-
-            self.allocator.switch_buffer(buffer, tensor_alignment);
+            // Allocate the graph
+            let graph_size =
+                self.allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
+            log::info!("Allocating {:.2} MB for graph", graph_size / (1024 * 1024));
+            // Pre-allocate the buffer for future use
+            let buffer = Buffer::new(graph_size);
+            self.allocator.switch_buffer(buffer, ggml::TENSOR_ALIGNMENT);
         }
 
         self.ctx0.recreate();
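Taken together, the session now follows a measure-then-allocate scheme: a measurement allocator walks a worst-case graph (a full history plus `n_batch` new tokens) to find the peak byte count, and only then is a single buffer of that size, padded by `TENSOR_ALIGNMENT`, created and handed over via `switch_buffer` for reuse on every evaluation. A self-contained toy of the two-pass idea; the real `GraphAllocator` is considerably more involved:

```rust
const TENSOR_ALIGNMENT: usize = 32; // mirrors ggml::TENSOR_ALIGNMENT

/// Toy measurement pass: tally aligned tensor sizes without allocating.
fn measure_graph(tensor_sizes: &[usize]) -> usize {
    tensor_sizes
        .iter()
        .map(|s| (s + TENSOR_ALIGNMENT - 1) / TENSOR_ALIGNMENT * TENSOR_ALIGNMENT)
        .sum()
}

fn main() {
    // Pass 1: "allocate" the worst-case graph on paper only.
    let worst_case = [4096usize, 100, 8192];
    let graph_size = measure_graph(&worst_case) + TENSOR_ALIGNMENT;

    // Pass 2: allocate one real buffer of exactly that size and reuse it
    // for every subsequent evaluation (the diff's `switch_buffer` step).
    let buffer = vec![0u8; graph_size];
    println!("graph buffer: {} bytes", buffer.len());
}
```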
