From 7a10cfb3db89ea69e7d8f7cf225afc1890cbe8b5 Mon Sep 17 00:00:00 2001
From: Akshay
Date: Mon, 31 Jul 2023 16:41:23 +0530
Subject: [PATCH 01/22] add bert model

Co-authored-by: Lukas Kreussel
Co-authored-by: Philpax
---
 Cargo.lock                    |  10 +
 crates/ggml/src/context.rs    |  30 +++
 crates/llm/Cargo.toml         |   4 +-
 crates/llm/src/lib.rs         |   1 +
 crates/models/bert/Cargo.toml |  14 +
 crates/models/bert/src/lib.rs | 464 ++++++++++++++++++++++++++++++++++
 6 files changed, 522 insertions(+), 1 deletion(-)
 create mode 100644 crates/models/bert/Cargo.toml
 create mode 100644 crates/models/bert/src/lib.rs

diff --git a/Cargo.lock b/Cargo.lock
index 9a3cc87a..272fcc85 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1262,6 +1262,7 @@ dependencies = [
  "bytesize",
  "clap",
  "llm-base",
+ "llm-bert",
  "llm-bloom",
  "llm-falcon",
  "llm-gpt2",
@@ -1297,6 +1298,15 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "llm-bert"
+version = "0.2.0-dev"
+dependencies = [
+ "bytemuck",
+ "llm-base",
+ "tracing",
+]
+
 [[package]]
 name = "llm-bloom"
 version = "0.2.0-dev"
diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 9f332251..c8a00c98 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -246,6 +246,12 @@ impl Context {
     pub fn storage(&self) -> &ContextStorage {
         self.storage.as_ref().unwrap()
     }
+
+    /// Sets all values of the tensor to the specified value.
+    pub fn set_f32(&self, a: &Tensor, x: f32) -> Tensor {
+        let raw = unsafe { sys::ggml_set_f32(a.ptr.as_ptr(), x) };
+        self.new_tensor_raw(raw)
+    }
 }
 // Operations
 impl Context {
@@ -598,6 +604,30 @@ impl Context {
         };
         self.new_tensor_raw(tensor)
     }
+
+    /// Creates a new tensor with the element-wise square of `a`
+    pub fn op_sqr(&self, a: &Tensor) -> Tensor {
+        let tensor = unsafe { sys::ggml_sqr(self.as_ptr(), a.ptr.as_ptr()) };
+        self.new_tensor_raw(tensor)
+    }
+
+    /// Creates a new tensor with the element-wise square root of `a`
+    pub fn op_sqrt(&self, a: &Tensor) -> Tensor {
+        let tensor = unsafe { sys::ggml_sqrt(self.as_ptr(), a.ptr.as_ptr()) };
+        self.new_tensor_raw(tensor)
+    }
+
+    /// Creates a new one-element tensor containing the sum of all elements of `a`
+    pub fn op_sum(&self, a: &Tensor) -> Tensor {
+        let tensor = unsafe { sys::ggml_sum(self.as_ptr(), a.ptr.as_ptr()) };
+        self.new_tensor_raw(tensor)
+    }
+
+    /// Creates a new tensor with the element-wise division of `a` by `b`
+    pub fn op_div(&self, a: &Tensor, b: &Tensor) -> Tensor {
+        let tensor = unsafe { sys::ggml_div(self.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) };
+        self.new_tensor_raw(tensor)
+    }
 }
 // Public to this crate methods
 impl Context {
diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml
index 0f395f5a..efff39e5 100644
--- a/crates/llm/Cargo.toml
+++ b/crates/llm/Cargo.toml
@@ -16,6 +16,7 @@
 llm-bloom = { path = "../models/bloom", optional = true, version = "0.2.0-dev" }
 llm-gptneox = { path = "../models/gptneox", optional = true, version = "0.2.0-dev" }
 llm-mpt = { path = "../models/mpt", optional = true, version = "0.2.0-dev" }
 llm-falcon = { path = "../models/falcon", optional = true, version = "0.2.0-dev" }
+llm-bert = { path = "../models/bert", optional = true, version = "0.2.0-dev" }
 serde = { workspace = true }
 tracing = { workspace = true }
@@ -34,13 +35,14 @@
 default = ["models", "tokenizers-remote"]
 
 tokenizers-remote = ["llm-base/tokenizers-remote"]
 
-models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"]
+models = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "bert"]
 llama = ["dep:llm-llama"]
 gpt2 = ["dep:llm-gpt2"]
 gptj = ["dep:llm-gptj"]
 bloom = ["dep:llm-bloom"]
 gptneox = ["dep:llm-gptneox"]
 mpt = ["dep:llm-mpt"]
+bert = ["dep:llm-bert"]
 # Falcon is off by default. See `llm_falcon`'s module documentation for more information.
 falcon = ["dep:llm-falcon"]
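For orientation, a sketch of how the new `bert` feature is meant to be consumed through the `llm` facade crate. This is hypothetical downstream code, not part of the patch; the loader call follows the pattern used by the other model crates on this branch, so treat the exact names (`llm::load`, `TokenizerSource::Embedded`, the model path) as assumptions:

    // In a downstream Cargo.toml (hypothetical):
    //   llm = { version = "0.2.0-dev", features = ["bert"] }
    use std::path::Path;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Load a GGML-format BERT model through the facade; path and
        // tokenizer source are placeholders.
        let bert = llm::load::<llm::models::Bert>(
            Path::new("models/bert.ggml.bin"),
            llm::TokenizerSource::Embedded,
            llm::ModelParameters::default(),
            llm::load_progress_callback_stdout,
        )?;
        // Sessions work the same as for the decoder models.
        let _session = bert.start_session(Default::default());
        Ok(())
    }
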
diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs
index febe2441..14800686 100644
--- a/crates/llm/src/lib.rs
+++ b/crates/llm/src/lib.rs
@@ -172,6 +172,7 @@ macro_rules! define_models {
 }
 
 define_models!(
+    (bert, "bert", Bert, llm_bert, "Bert"),
     (bloom, "bloom", Bloom, llm_bloom, "BLOOM"),
     (gpt2, "gpt2", Gpt2, llm_gpt2, "GPT-2"),
     (gptj, "gptj", GptJ, llm_gptj, "GPT-J"),
diff --git a/crates/models/bert/Cargo.toml b/crates/models/bert/Cargo.toml
new file mode 100644
index 00000000..0be81b40
--- /dev/null
+++ b/crates/models/bert/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "llm-bert"
+version = "0.2.0-dev"
+license = { workspace = true }
+repository = { workspace = true }
+description = "An implementation of BERT for the `llm` ecosystem."
+edition = "2021"
+readme = "../../../README.md"
+
+[dependencies]
+bytemuck.workspace = true
+llm-base = { path = "../../llm-base", version = "0.2.0-dev" }
+tracing = { version = "0.1", features = ["log"] }
+
diff --git a/crates/models/bert/src/lib.rs b/crates/models/bert/src/lib.rs
new file mode 100644
index 00000000..857ffcbc
--- /dev/null
+++ b/crates/models/bert/src/lib.rs
@@ -0,0 +1,464 @@
+//! An implementation of [BERT](https://huggingface.co/docs/transformers/model_doc/bert) for the `llm` ecosystem.
+#![deny(missing_docs)]
+
+use std::{error::Error, sync::Arc};
+
+use llm_base::{
+    ggml,
+    model::{common, HyperparametersWriteError},
+    util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError,
+    ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer,
+};
+
+/// The BERT model.
+///
+/// # Safety
+/// This implements [Send] and [Sync] as it is immutable after construction.
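+///
+/// # Output
+/// Unlike the decoder-only models in this workspace, this model is primarily
+/// useful for sentence embeddings: `evaluate` mean-pools the final hidden
+/// states and L2-normalizes the result rather than producing next-token logits.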
+pub struct Bert {
+    params: ModelParameters,
+    hyperparameters: Hyperparameters,
+    tokenizer: Tokenizer,
+
+    word_embeddings: ggml::Tensor,
+    token_type_embeddings: ggml::Tensor,
+    position_embeddings: ggml::Tensor,
+    ln_e_w: ggml::Tensor,
+    ln_e_b: ggml::Tensor,
+
+    // weights for the model
+    layers: Vec<Layer>,
+
+    // must be kept alive for the model
+    context: Arc<ggml::Context>,
+}
+
+unsafe impl Send for Bert {}
+unsafe impl Sync for Bert {}
+
+/// BERT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning))
+#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
+pub struct Hyperparameters {
+    /// Size of the model's vocabulary
+    pub n_vocab: usize,
+
+    /// Maximum number of input tokens (maximum sequence length)
+    pub n_max_tokens: usize,
+
+    /// Size of the model's embedding layer
+    pub n_embd: usize,
+
+    /// Size of the feed-forward (intermediate) layer
+    pub n_intermediate: usize,
+
+    /// Number of attention heads
+    pub n_head: usize,
+
+    /// Number of layers in the model
+    pub n_layer: usize,
+
+    /// The file type of the model's weights
+    pub file_type: FileType,
+}
+
+impl KnownModel for Bert {
+    type Hyperparameters = Hyperparameters;
+
+    fn new<E: Error>(
+        hyperparameters: Self::Hyperparameters,
+        params: ModelParameters,
+        tokenizer: Tokenizer,
+        tensor_loader: impl TensorLoader<E>,
+    ) -> Result<Self, E> {
+        let mut tl = tensor_loader;
+
+        let word_embeddings = tl.load("embeddings.word_embeddings.weight")?;
+        let token_type_embeddings = tl.load("embeddings.token_type_embeddings.weight")?;
+        let position_embeddings = tl.load("embeddings.position_embeddings.weight")?;
+
+        let ln_e_w = tl.load("embeddings.LayerNorm.weight")?;
+        let ln_e_b = tl.load("embeddings.LayerNorm.bias")?;
+
+        let mut layers = Vec::new();
+
+        for i in 0..hyperparameters.n_layer {
+            let backend = params.backend(i);
+
+            let layer = Layer {
+                ln_att_w: tl
+                    .load(&format!(
+                        "encoder.layer.{i}.attention.output.LayerNorm.weight"
+                    ))?
+                    .transfer_to(backend),
+                ln_att_b: tl
+                    .load(&format!(
+                        "encoder.layer.{i}.attention.output.LayerNorm.bias"
+                    ))?
+                    .transfer_to(backend),
+
+                // attention
+                q_w: tl
+                    .load(&format!("encoder.layer.{i}.attention.self.query.weight"))?
+                    .transfer_to(backend),
+                q_b: tl
+                    .load(&format!("encoder.layer.{i}.attention.self.query.bias"))?
+                    .transfer_to(backend),
+                k_w: tl
+                    .load(&format!("encoder.layer.{i}.attention.self.key.weight"))?
+                    .transfer_to(backend),
+                k_b: tl
+                    .load(&format!("encoder.layer.{i}.attention.self.key.bias"))?
+                    .transfer_to(backend),
+                v_w: tl
+                    .load(&format!("encoder.layer.{i}.attention.self.value.weight"))?
+                    .transfer_to(backend),
+                v_b: tl
+                    .load(&format!("encoder.layer.{i}.attention.self.value.bias"))?
+                    .transfer_to(backend),
+
+                o_w: tl
+                    .load(&format!("encoder.layer.{i}.attention.output.dense.weight"))?
+                    .transfer_to(backend),
+                o_b: tl
+                    .load(&format!("encoder.layer.{i}.attention.output.dense.bias"))?
+                    .transfer_to(backend),
+
+                // ff
+                ff_i_w: tl
+                    .load(&format!("encoder.layer.{i}.intermediate.dense.weight"))?
+                    .transfer_to(backend),
+                ff_i_b: tl
+                    .load(&format!("encoder.layer.{i}.intermediate.dense.bias"))?
+                    .transfer_to(backend),
+
+                ln_out_w: tl
+                    .load(&format!("encoder.layer.{i}.output.LayerNorm.weight"))?
+                    .transfer_to(backend),
+                ln_out_b: tl
+                    .load(&format!("encoder.layer.{i}.output.LayerNorm.bias"))?
+                    .transfer_to(backend),
+                ff_o_w: tl
+                    .load(&format!("encoder.layer.{i}.output.dense.weight"))?
+                    .transfer_to(backend),
+                ff_o_b: tl
+                    .load(&format!("encoder.layer.{i}.output.dense.bias"))?
+                    .transfer_to(backend),
+            };
+
+            layers.push(layer);
+        }
+        let context = tl.finish();
+
+        Ok(Self {
+            ln_e_b,
+            ln_e_w,
+            position_embeddings,
+            token_type_embeddings,
+            word_embeddings,
+            hyperparameters,
+            params,
+            tokenizer,
+            layers,
+            context: Arc::new(context),
+        })
+    }
+
+    /// Starts a new `InferenceSession` for this model.
+    fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
+        InferenceSession::new(
+            config,
+            &self.params,
+            self.hyperparameters.n_layer,
+            self.hyperparameters.n_embd,
+            self.hyperparameters.n_vocab,
+        )
+    }
+
+    #[tracing::instrument(level = "trace", skip_all)]
+    fn evaluate(
+        &self,
+        session: &mut InferenceSession,
+        input_tokens: &[TokenId],
+        output_request: &mut OutputRequest,
+    ) {
+        let input_len = input_tokens.len();
+        let _session_len = session.n_past;
+        let _ctx_size = self.params.context_size;
+
+        let Hyperparameters {
+            n_vocab,
+            n_max_tokens: _,
+            n_embd,
+            n_intermediate: _,
+            n_head,
+            n_layer,
+            file_type: _,
+        } = self.hyperparameters;
+
+        let d_head = n_embd / n_head;
+
+        let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let mut ctx0 = builder.ctx0.borrow_mut();
+            let gf = ggml::ComputationGraph::new();
+
+            let embd = builder.embd;
+
+            let mut input_layer = ctx0.op_get_rows(&self.word_embeddings, embd);
+
+            // IL = word_embeddings + token_types + position_embeddings
+            {
+                // token-types: a zero tensor
+                let mut token_types = ctx0.new_tensor_1d(llm_base::ElementType::I32, input_len);
+                token_types.zero_data();
+
+                // position embeddings: the positions 0..input_len
+                let position_buf: Vec<i32> = (0..input_len as i32).collect();
+                let mut positions = ctx0.new_tensor_1d(llm_base::ElementType::I32, input_len);
+                unsafe { positions.write_data(bytemuck::cast_slice(&position_buf)) };
+
+                // IL += token_types
+                input_layer = ctx0.op_add(
+                    &input_layer,
+                    &ctx0.op_get_rows(&self.token_type_embeddings, &token_types),
+                );
+
+                // IL += position_embeddings
+                input_layer = ctx0.op_add(
+                    &input_layer,
+                    &ctx0.op_get_rows(&self.position_embeddings, &positions),
+                );
+            }
+
+            // embd norm
+            {
+                input_layer = ctx0.op_norm(&input_layer);
+                input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_e_w), &self.ln_e_b);
+            }
+
+            for il in 0..n_layer {
+                ctx0.set_offloading(self.params.should_offload(il));
+
+                let mut current = input_layer.share();
+
+                // self-attention
+                {
+                    let q_current = ctx0.op_reshape_3d(
+                        &ctx0.op_add(
+                            &ctx0.op_mul_mat(&self.layers[il].q_w, &current),
+                            &self.layers[il].q_b,
+                        ),
+                        d_head,
+                        n_head,
+                        input_len,
+                    );
+                    let q = ctx0.op_permute(&q_current, (0, 2, 1, 3));
+
+                    let k_current = ctx0.op_reshape_3d(
+                        &ctx0.op_add(
+                            &ctx0.op_mul_mat(&self.layers[il].k_w, &current),
+                            &self.layers[il].k_b,
+                        ),
+                        d_head,
+                        n_head,
+                        input_len,
+                    );
+                    let k = ctx0.op_permute(&k_current, (0, 2, 1, 3));
+
+                    let v_current = ctx0.op_reshape_3d(
+                        &ctx0.op_add(
+                            &ctx0.op_mul_mat(&self.layers[il].v_w, &current),
+                            &self.layers[il].v_b,
+                        ),
+                        d_head,
+                        n_head,
+                        input_len,
+                    );
+                    let mut v = ctx0.op_permute(&v_current, (0, 2, 1, 3));
+
+                    let mut kq = ctx0.op_mul_mat(&k, &q);
+
+                    // TODO: look into op_scale_inplace and op_soft_max_inplace
+                    kq = ctx0.op_scale(
+                        &kq,
+                        &ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())),
+                    );
+                    kq = ctx0.op_soft_max(&kq);
+
+                    v = ctx0.op_cont(&ctx0.op_transpose(&v));
+
+                    let mut kqv = ctx0.op_mul_mat(&v, &kq);
+                    kqv = ctx0.op_permute(&kqv, (0, 2, 1, 3));
+
+                    current = ctx0.op_cpy(
+                        &kqv,
+                        &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len),
+                    );
+                }
+
+                // attention output
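+                // project the concatenated heads back to the model width:
+                // current = o_w * current + o_b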
+                current = ctx0.op_add(
+                    &ctx0.op_mul_mat(&self.layers[il].o_w, &current),
+                    &self.layers[il].o_b,
+                );
+
+                // re-add the layer input
+                current = ctx0.op_add(&current, &input_layer);
+
+                // attention norm
+                {
+                    current = ctx0.op_norm(&current);
+                    current = ctx0.op_add(
+                        &ctx0.op_mul(&current, &self.layers[il].ln_att_w),
+                        &self.layers[il].ln_att_b,
+                    );
+                }
+
+                let att_output = current.share();
+
+                // intermediate output
+                current = ctx0.op_mul_mat(&self.layers[il].ff_i_w, &current);
+                current = ctx0.op_add(&current, &self.layers[il].ff_i_b);
+                current = ctx0.op_gelu(&current);
+
+                // layer output
+                current = ctx0.op_mul_mat(&self.layers[il].ff_o_w, &current);
+                current = ctx0.op_add(&current, &self.layers[il].ff_o_b);
+
+                // attentions bypass the intermediate layer
+                current = ctx0.op_add(&att_output, &current);
+
+                // output norm
+                {
+                    current = ctx0.op_norm(&current);
+                    current = ctx0.op_add(
+                        &ctx0.op_mul(&current, &self.layers[il].ln_out_w),
+                        &self.layers[il].ln_out_b,
+                    );
+                }
+
+                // input for next layer
+                input_layer = current;
+            }
+            input_layer = ctx0.op_cont(&ctx0.op_transpose(&input_layer));
+
+            ctx0.set_offloading(false);
+            // pooler: mean over the sequence via matmul with a 1/input_len vector
+            let mut sum = ctx0.new_tensor_2d(llm_base::ElementType::F32, input_len, 1);
+            sum = ctx0.set_f32(&sum, 1.0 / (input_len as f32));
+            input_layer = ctx0.op_mul_mat(&input_layer, &sum);
+
+            // normalizer: divide by the L2 norm of the pooled embedding
+            let length = ctx0.op_sqrt(&ctx0.op_sum(&ctx0.op_sqr(&input_layer)));
+
+            input_layer = ctx0.op_scale(&input_layer, &ctx0.op_div(&ctx0.new_f32(1.0), &length));
+
+            (
+                gf,
+                GraphOutputs {
+                    result: input_layer.share(),
+                    embedding_result: input_layer.share(),
+                },
+            )
+        });
+
+        // finish evaluation
+        common::read_last_token(session, &outputs.result, n_vocab, input_len);
+        common::extract_logits(output_request, &outputs.result, n_vocab, input_len);
+        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, 1);
+    }
+
+    fn hyperparameters(&self) -> &Self::Hyperparameters {
+        &self.hyperparameters
+    }
+
+    fn tokenizer(&self) -> &Tokenizer {
+        &self.tokenizer
+    }
+
+    fn context_size(&self) -> usize {
+        self.params.context_size
+    }
+
+    fn bot_token_id(&self) -> Option<TokenId> {
+        self.tokenizer.id("[PAD]".as_bytes())
+    }
+
+    fn eot_token_id(&self) -> TokenId {
+        self.tokenizer.id("</s>".as_bytes()).unwrap_or(2)
+    }
+
+    fn quantize_tensors() -> Vec<Regex> {
+        vec![Regex::new(".*weight").unwrap()]
+    }
+
+    fn skip_quantize_tensors() -> Vec<Regex> {
+        vec![]
+    }
+
+    fn supports_rewind(&self) -> bool {
+        true
+    }
+}
+
+impl llm_base::Hyperparameters for Hyperparameters {
+    fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result<Self, LoadError> {
+        Ok(Hyperparameters {
+            n_vocab: util::read_i32(reader)?.try_into()?,
+            n_max_tokens: util::read_i32(reader)?.try_into()?,
+            n_embd: util::read_i32(reader)?.try_into()?,
+            n_intermediate: util::read_i32(reader)?.try_into()?,
+            n_head: util::read_i32(reader)?.try_into()?,
+            n_layer: util::read_i32(reader)?.try_into()?,
+            file_type: util::read_filetype(reader)?,
+        })
+    }
+
+    fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> {
+        util::write_i32(writer, self.n_vocab.try_into()?)?;
+        util::write_i32(writer, self.n_max_tokens.try_into()?)?;
+        util::write_i32(writer, self.n_embd.try_into()?)?;
+        util::write_i32(writer, self.n_intermediate.try_into()?)?;
+        util::write_i32(writer, self.n_head.try_into()?)?;
+        util::write_i32(writer, self.n_layer.try_into()?)?;
+        util::write_i32(writer, self.file_type.into())?;
+        Ok(())
+    }
+
+    fn n_vocabulary(&self) -> usize {
+        self.n_vocab
+    }
+
+    fn file_type(&self) -> Option<FileType> {
+        Some(self.file_type)
+    }
+
+    fn file_type_mut(&mut self) -> Option<&mut FileType> {
+        Some(&mut self.file_type)
+    }
+}
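+
+// The weights of one BERT encoder block, following the Hugging Face
+// checkpoint naming that the tensor loader in `new` expects.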
+struct Layer {
+    // normalization
+    ln_att_w: ggml::Tensor,
+    ln_att_b: ggml::Tensor,
+
+    ln_out_w: ggml::Tensor,
+    ln_out_b: ggml::Tensor,
+
+    // attention
+    q_w: ggml::Tensor,
+    q_b: ggml::Tensor,
+    k_w: ggml::Tensor,
+    k_b: ggml::Tensor,
+    v_w: ggml::Tensor,
+    v_b: ggml::Tensor,
+
+    o_w: ggml::Tensor,
+    o_b: ggml::Tensor,
+
+    // ff
+    ff_i_w: ggml::Tensor,
+    ff_i_b: ggml::Tensor,
+
+    ff_o_w: ggml::Tensor,
+    ff_o_b: ggml::Tensor,
+}

From b9b1391ee8233853990e036cb001a52a06ea7ce0 Mon Sep 17 00:00:00 2001
From: Andrew
Date: Wed, 23 Aug 2023 14:59:59 +0200
Subject: [PATCH 02/22] Remove error on context window overflow

---
 crates/llm-base/src/inference_session.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index e3f5a785..e8dbb1c2 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -381,9 +381,10 @@ impl InferenceSession {
         output_request: &mut OutputRequest,
         rng: &mut impl rand::Rng,
     ) -> Result<TokenId, InferenceError> {
-        if self.n_past + 1 >= model.context_size() {
-            return Err(InferenceError::ContextFull);
-        }
+        // Disable the error on context-size overflow so that a llama.cpp-style
+        // "context window slide" can be used instead (if one is available).
+        // if self.n_past + 1 >= model.context_size() {
+        //     return Err(InferenceError::ContextFull);
+        // }
 
         let next_token = crate::samplers::sample_token(
             params.sampler.clone(),

From 99a9fb4dfefa455b983c016a89f54618d062469a Mon Sep 17 00:00:00 2001
From: Andrii Kotliar
Date: Tue, 12 Sep 2023 14:40:26 +0200
Subject: [PATCH 03/22] Add "context swap" functions to session and add
 "decoded_tokens" to snapshot read/write

---
 crates/llm-base/src/inference_session.rs | 217 ++++++++++++++++++++---
 1 file changed, 191 insertions(+), 26 deletions(-)

diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index e8dbb1c2..12f66f5e 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -299,15 +299,22 @@ impl InferenceSession {
         output_request: &mut OutputRequest,
         mut callback: impl FnMut(&[u8]) -> Result<InferenceFeedback, E>,
     ) -> Result<(), InferenceError> {
-        let beginning_of_sentence = self.n_past == 0;
-
-        let vocab = model.tokenizer();
-        let prompt_tokens = prompt.into().to_tokens(vocab, beginning_of_sentence)?;
+        let prompt_tokens = self.get_prompt_tokens(model, prompt)?;
 
         if self.n_past + prompt_tokens.len() >= model.context_size() {
             return Err(InferenceError::ContextFull);
         }
 
+        self.feed_prompt_tokens(model, output_request, &mut callback, prompt_tokens)
+    }
+
+    fn feed_prompt_tokens<E: std::error::Error + Send + Sync + 'static>(
+        &mut self,
+        model: &dyn Model,
+        output_request: &mut OutputRequest,
+        mut callback: impl FnMut(&[u8]) -> Result<InferenceFeedback, E>,
+        prompt_tokens: Vec<TokenId>,
+    ) -> Result<(), InferenceError> {
         'outer: for batch in prompt_tokens.chunks(self.config.n_batch) {
             model.evaluate(self, batch, output_request);
             for &tk in batch {
@@ -341,10 +348,46 @@ impl InferenceSession {
             }
         }
         log::trace!("Finished feed prompt");
-
         Ok(())
     }
 
+    fn get_prompt_tokens<'a, P: Into<Prompt<'a>>>(
+        &self,
+        model: &dyn Model,
+        prompt: P,
+    ) -> Result<Vec<TokenId>, TokenizationError> {
+        let beginning_of_sentence = self.n_past == 0;
+
+        let vocab = model.tokenizer();
+        prompt.into().to_tokens(vocab, beginning_of_sentence)
+    }
+
+    /// Feed a prompt to the model for this session.
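+    ///
+    /// The first `n_keep` tokens of the context (typically a fixed system
+    /// prompt) always survive a swap; the overflow is removed from the tokens
+    /// that follow them, via `remove_tokens` below.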
+    ///
+    /// Same as [Self::feed_prompt], but removes old tokens from the session as
+    /// needed, so a prompt that would otherwise overflow the context window
+    /// can still be fed.
+    #[instrument(skip_all)]
+    pub fn feed_prompt_with_swap<
+        'a,
+        E: std::error::Error + Send + Sync + 'static,
+        P: Into<Prompt<'a>>,
+    >(
+        &mut self,
+        model: &dyn Model,
+        prompt: P,
+        n_keep: usize,
+        output_request: &mut OutputRequest,
+        mut callback: impl FnMut(&[u8]) -> Result<InferenceFeedback, E>,
+    ) -> Result<(), InferenceError> {
+        let prompt_tokens = self.get_prompt_tokens(model, prompt)?;
+
+        if self.n_past + prompt_tokens.len() >= model.context_size() {
+            let rewind_by = self.n_past + prompt_tokens.len() - model.context_size();
+            self.remove_tokens(model, n_keep, rewind_by)
+                .map_err(|_e| InferenceError::ContextFull)?;
+        }
+
+        self.feed_prompt_tokens(model, output_request, &mut callback, prompt_tokens)
+    }
+
     /// Removes `num` tokens from the end of the buffer. Roughly the inverse of `feed_prompt`.
     pub fn rewind(&mut self, model: &dyn Model, num: usize) -> Result<Vec<TokenId>, RewindError> {
         if !model.supports_rewind() {
@@ -372,6 +415,46 @@ impl InferenceSession {
         Ok(deleted_tokens)
     }
 
+    /// Removes `num` tokens starting at the specified position in the buffer.
+    /// Similar to [Self::rewind].
+    fn remove_tokens(
+        &mut self,
+        model: &dyn Model,
+        start_from: usize,
+        num: usize,
+    ) -> Result<Vec<TokenId>, RewindError> {
+        if !model.supports_rewind() {
+            return Err(RewindError::UnsupportedArchitecture);
+        }
+
+        if start_from + num >= self.n_past {
+            return Err(RewindError::NotEnoughTokens);
+        }
+
+        // Remove the tokens from self.tokens.
+        let end = start_from + num;
+        let deleted_tokens: Vec<_> = self.tokens.drain(start_from..end).collect();
+
+        // Remove the corresponding bytes from the decoded text
+        let mut decoded_start = 0;
+        let mut decoded_end = 0;
+        if start_from != 0 {
+            for id in &self.tokens[0..start_from] {
+                decoded_start += model.tokenizer().token(*id as usize).len();
+            }
+            decoded_end += decoded_start;
+        }
+
+        for id in &deleted_tokens {
+            decoded_end += model.tokenizer().token(*id as usize).len();
+        }
+        self.decoded_tokens.drain(decoded_start..decoded_end);
+
+        // Decrement the n_past tokens counter.
+        self.n_past -= num;
+
+        Ok(deleted_tokens)
+    }
+
     /// Infer the next token for this session.
     #[instrument(level = "trace", skip_all)]
     pub fn infer_next_token(
@@ -381,10 +464,9 @@ impl InferenceSession {
         output_request: &mut OutputRequest,
         rng: &mut impl rand::Rng,
     ) -> Result<TokenId, InferenceError> {
-        // Disable the error on context-size overflow so that a llama.cpp-style
-        // "context window slide" can be used instead (if one is available).
-        // if self.n_past + 1 >= model.context_size() {
-        //     return Err(InferenceError::ContextFull);
-        // }
+        if self.n_past + 1 >= model.context_size() {
+            return Err(InferenceError::ContextFull);
+        }
 
         let next_token = crate::samplers::sample_token(
             params.sampler.clone(),
@@ -438,19 +520,7 @@ impl InferenceSession {
     ) -> Result<InferenceStats, InferenceError> {
         let maximum_token_count = request.maximum_token_count.unwrap_or(usize::MAX);
         if request.play_back_previous_tokens {
-            // "Play back" the existing tokens, so that loading from an inference snapshot works
-            // as expected.
-            let mut token_utf8_buf = TokenUtf8Buffer::new();
-            for token_id in &self.tokens {
-                // Buffer the token until it's valid UTF-8, then call the callback.
-                if let Some(tokens) =
-                    token_utf8_buf.push(&model.tokenizer().token(*token_id as usize))
-                {
-                    if let Err(e) = callback(InferenceResponse::SnapshotToken(tokens)) {
-                        return Err(InferenceError::UserCallback(Box::new(e)));
-                    }
-                }
-            }
+            self.play_back_previous_tokens(model, &mut callback)?
         }
         log::trace!(
             "Starting inference request with max_token_count: {}",
             maximum_token_count
         );
@@ -475,10 +545,25 @@ impl InferenceSession {
         stats.feed_prompt_duration = start_at.elapsed().unwrap();
         stats.prompt_tokens = self.n_past;
 
-        // After the prompt is consumed, sample tokens by repeatedly calling
-        // `infer_next_token`. We generate tokens until the model returns an
-        // EndOfText token, or we run out of space in the context window,
-        // or we reach the specified limit.
+        self.infer_tokens(model, rng, &mut callback, maximum_token_count, parameters)?;
+        stats.predict_duration = start_at.elapsed().unwrap();
+        stats.predict_tokens = self.n_past;
+
+        Ok(stats)
+    }
+
+    /// Sample tokens by repeatedly calling
+    /// [Self::infer_next_token]. Generate tokens until the model returns an
+    /// EndOfText token, or we run out of space in the context window,
+    /// or we reach the specified limit.
+    fn infer_tokens<E: std::error::Error + Send + Sync + 'static>(
+        &mut self,
+        model: &dyn Model,
+        rng: &mut impl rand::Rng,
+        mut callback: impl FnMut(InferenceResponse) -> Result<InferenceFeedback, E>,
+        maximum_token_count: usize,
+        parameters: &InferenceParameters,
+    ) -> Result<(), InferenceError> {
         let mut tokens_processed = 0;
         let mut token_utf8_buf = TokenUtf8Buffer::new();
         while tokens_processed < maximum_token_count {
@@ -502,6 +587,79 @@ impl InferenceSession {
 
             tokens_processed += 1;
         }
+        Ok(())
+    }
+
+    /// "Play back" the existing tokens, so that loading from an inference snapshot works
+    /// as expected.
+    fn play_back_previous_tokens<E: std::error::Error + Send + Sync + 'static>(
+        &mut self,
+        model: &dyn Model,
+        mut callback: impl FnMut(InferenceResponse) -> Result<InferenceFeedback, E>,
+    ) -> Result<(), InferenceError> {
+        let mut token_utf8_buf = TokenUtf8Buffer::new();
+        for token_id in &self.tokens {
+            // Buffer the token until it's valid UTF-8, then call the callback.
+            if let Some(tokens) = token_utf8_buf.push(&model.tokenizer().token(*token_id as usize))
+            {
+                if let Err(e) = callback(InferenceResponse::SnapshotToken(tokens)) {
+                    return Err(InferenceError::UserCallback(Box::new(e)));
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Generate text by using the provided [Model] to evaluate the `prompt`.
+    /// Works the same way as [Self::infer], but supports "infinite" text
+    /// generation via context swapping.
+    #[instrument(skip_all)]
+    pub fn infer_with_swap<E: std::error::Error + Send + Sync + 'static>(
+        &mut self,
+        model: &dyn Model,
+        rng: &mut impl rand::Rng,
+        request: &InferenceRequest,
+        n_keep: usize,
+        output_request: &mut OutputRequest,
+        mut callback: impl FnMut(InferenceResponse) -> Result<InferenceFeedback, E>,
+    ) -> Result<InferenceStats, InferenceError> {
+        let maximum_token_count = request.maximum_token_count.unwrap_or(usize::MAX);
+        if request.play_back_previous_tokens {
+            self.play_back_previous_tokens(model, &mut callback)?
+        }
+
+        // infinite text generation via context swapping
+        // if we run out of context:
+        // - keep the first n_keep tokens of the original prompt
+        // - remove half of the tokens after n_keep ((n_ctx - n_keep) / 2)
+        if self.n_past >= model.context_size() {
+            self.remove_tokens(model, n_keep, (self.n_past - n_keep) / 2)
+                .map_err(|_e| InferenceError::ContextFull)?;
+        }
+
+        log::trace!(
+            "Starting inference request with max_token_count: {}",
+            maximum_token_count
+        );
+
+        let mut stats = InferenceStats::default();
+        let start_at = std::time::SystemTime::now();
+
+        let parameters = request.parameters;
+
+        // Feed the initial prompt through the transformer, to update its
+        // context window with new data, if necessary.
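+        // A worked example of the swap above (illustrative numbers): with
+        // context_size = 2048 and n_keep = 64, a full context (n_past = 2048)
+        // has (2048 - 64) / 2 = 992 tokens removed at positions 64..1056,
+        // leaving n_past = 1056 and 992 free slots for the prompt and new tokens.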
+        if !request.prompt.is_empty() {
+            self.feed_prompt(
+                model,
+                request.prompt,
+                output_request,
+                feed_prompt_callback(&mut callback),
+            )?;
+        }
+        stats.feed_prompt_duration = start_at.elapsed().unwrap();
+        stats.prompt_tokens = self.n_past;
+
+        self.infer_tokens(model, rng, &mut callback, maximum_token_count, parameters)?;
         stats.predict_duration = start_at.elapsed().unwrap();
         stats.predict_tokens = self.n_past;
 
         Ok(stats)
     }
@@ -605,6 +763,7 @@ impl InferenceSession {
             npast: self.n_past,
             config: self.config,
             tokens: self.tokens.clone(),
+            decoded_tokens: self.decoded_tokens.clone(),
             logits: self.last_logits.clone(),
             memory_k,
             memory_v,
@@ -637,6 +796,7 @@ impl InferenceSession {
 
         session.n_past = snapshot.npast;
         session.tokens = snapshot.tokens;
+        session.decoded_tokens = snapshot.decoded_tokens;
         session.last_logits = snapshot.last_logits;
 
         Ok(session)
@@ -742,6 +902,8 @@ pub struct InferenceSnapshotRef<'a> {
     pub config: InferenceSessionConfig,
     /// All tokens generated by this inference session.
     pub tokens: Vec<TokenId>,
+    /// The raw UTF-8 bytes of all tokens decoded so far by this inference session.
+    pub decoded_tokens: Vec<u8>,
     /// The vector of logits that was produced after the last inference.
     pub logits: Vec<f32>,
     /// The contents of the 'key' memory tensor.
@@ -760,6 +922,7 @@ impl InferenceSnapshotRef<'_> {
             npast: self.npast,
             config: self.config,
             tokens: self.tokens.clone(),
+            decoded_tokens: self.decoded_tokens.clone(),
             last_logits: self.logits.clone(),
             memory_k: self.memory_k.to_vec(),
             memory_v: self.memory_v.to_vec(),
@@ -778,6 +941,8 @@ pub struct InferenceSnapshot {
     pub config: InferenceSessionConfig,
     /// All tokens generated by this inference session.
     pub tokens: Vec<TokenId>,
+    /// The raw UTF-8 bytes of all tokens decoded so far by this inference session.
+    pub decoded_tokens: Vec<u8>,
     /// The vector of logits that was produced after the last inference.
     pub last_logits: Vec<f32>,
     /// The contents of the 'key' memory tensor.

From 6835335b6426783028741bc05f865eb16ecd30b7 Mon Sep 17 00:00:00 2001
From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com>
Date: Sat, 16 Sep 2023 14:29:01 +0200
Subject: [PATCH 04/22] Build against newer GGML version

---
 binaries/generate-ggml-bindings/src/main.rs |   2 +
 crates/ggml/src/context.rs                  |   2 +-
 crates/ggml/src/lib.rs                      |   2 +-
 crates/ggml/sys/src/cuda.rs                 |  43 +-
 crates/ggml/sys/src/lib.rs                  | 915 ++++++++++++++++----
 crates/ggml/sys/src/llama.rs                |  11 +-
 crates/ggml/sys/src/metal.rs                |  18 +-
 7 files changed, 795 insertions(+), 198 deletions(-)

diff --git a/binaries/generate-ggml-bindings/src/main.rs b/binaries/generate-ggml-bindings/src/main.rs
index 39acbb86..30991953 100644
--- a/binaries/generate-ggml-bindings/src/main.rs
+++ b/binaries/generate-ggml-bindings/src/main.rs
@@ -27,6 +27,8 @@ fn generate_main(ggml_path: &Path, src_path: &Path) {
         .allowlist_file(r".*ggml.h")
         .header(ggml_path.join("k_quants.h").to_string_lossy())
         .allowlist_file(r".*k_quants.h")
+        .header(ggml_path.join("ggml-alloc.h").to_string_lossy())
+        .allowlist_file(r".*ggml-alloc.h")
         // Suppress some warnings
         .raw_line("#![allow(non_upper_case_globals)]")
         .raw_line("#![allow(non_camel_case_types)]")
diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 2f2d04f0..6df8c4e4 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -288,7 +288,7 @@ impl Context {
 
     /// Creates a new tensor with the values of `a`, but normalized.
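    /// (This is layer normalization. The newer ggml signature takes the
    /// epsilon explicitly; this wrapper pins it to [crate::DEFAULT_EPS].)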
pub fn op_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { sys::ggml_norm(self.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_norm(self.as_ptr(), a.ptr.as_ptr(), crate::DEFAULT_EPS) }; self.new_tensor_raw(tensor) } diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 8b6910eb..51160515 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -129,7 +129,7 @@ pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE; pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize; /// Default epsilon to use for RMS computation. -pub const DEFAULT_EPS: f32 = sys::llama::LLAMA_DEFAULT_RMS_EPS as f32; +pub const DEFAULT_EPS: f32 = 0.000005; /// Value overrides to use for RoPE. /// diff --git a/crates/ggml/sys/src/cuda.rs b/crates/ggml/sys/src/cuda.rs index a9ae1a8d..5208b66e 100644 --- a/crates/ggml/sys/src/cuda.rs +++ b/crates/ggml/sys/src/cuda.rs @@ -3,15 +3,17 @@ use super::ggml_compute_params; use super::ggml_tensor; +pub const GGML_CUDA_NAME: &[u8; 5usize] = b"CUDA\0"; +pub const GGML_CUBLAS_NAME: &[u8; 7usize] = b"cuBLAS\0"; pub const GGML_CUDA_MAX_DEVICES: u32 = 16; extern "C" { pub fn ggml_init_cublas(); } extern "C" { - pub fn ggml_cuda_set_tensor_split(tensor_split: *const f32); + pub fn ggml_cuda_host_malloc(size: usize) -> *mut ::std::os::raw::c_void; } extern "C" { - pub fn ggml_cuda_mul(src0: *const ggml_tensor, src1: *const ggml_tensor, dst: *mut ggml_tensor); + pub fn ggml_cuda_host_free(ptr: *mut ::std::os::raw::c_void); } extern "C" { pub fn ggml_cuda_can_mul_mat( @@ -21,26 +23,7 @@ extern "C" { ) -> bool; } extern "C" { - pub fn ggml_cuda_mul_mat_get_wsize( - src0: *const ggml_tensor, - src1: *const ggml_tensor, - dst: *mut ggml_tensor, - ) -> usize; -} -extern "C" { - pub fn ggml_cuda_mul_mat( - src0: *const ggml_tensor, - src1: *const ggml_tensor, - dst: *mut ggml_tensor, - wdata: *mut ::std::os::raw::c_void, - wsize: usize, - ); -} -extern "C" { - pub fn ggml_cuda_host_malloc(size: usize) -> *mut ::std::os::raw::c_void; -} -extern "C" { - pub fn ggml_cuda_host_free(ptr: *mut ::std::os::raw::c_void); + pub fn ggml_cuda_set_tensor_split(tensor_split: *const f32); } extern "C" { pub fn ggml_cuda_transform_tensor(data: *mut ::std::os::raw::c_void, tensor: *mut ggml_tensor); @@ -57,6 +40,12 @@ extern "C" { extern "C" { pub fn ggml_cuda_assign_buffers_force_inplace(tensor: *mut ggml_tensor); } +extern "C" { + pub fn ggml_cuda_assign_buffers_no_alloc(tensor: *mut ggml_tensor); +} +extern "C" { + pub fn ggml_cuda_assign_scratch_offset(tensor: *mut ggml_tensor, offset: usize); +} extern "C" { pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int); } @@ -75,3 +64,13 @@ extern "C" { tensor: *mut ggml_tensor, ) -> bool; } +extern "C" { + pub fn ggml_cuda_get_device_count() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cuda_get_device_description( + device: ::std::os::raw::c_int, + description: *mut ::std::os::raw::c_char, + description_size: usize, + ); +} diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index 77b47802..884ef75b 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -22,12 +22,17 @@ pub const GGML_MAX_NODES: u32 = 4096; pub const GGML_MAX_PARAMS: u32 = 256; pub const GGML_MAX_CONTEXTS: u32 = 64; pub const GGML_MAX_SRC: u32 = 6; -pub const GGML_MAX_NAME: u32 = 48; +pub const GGML_MAX_NAME: u32 = 64; pub const GGML_MAX_OP_PARAMS: u32 = 32; pub const GGML_DEFAULT_N_THREADS: u32 = 4; +pub const GGML_MEM_ALIGN: u32 = 16; pub const GGML_EXIT_SUCCESS: u32 = 0; pub 
const GGML_EXIT_ABORTED: u32 = 1; +pub const GGUF_MAGIC: u32 = 1179993927; +pub const GGUF_VERSION: u32 = 2; +pub const GGUF_DEFAULT_ALIGNMENT: u32 = 32; pub const GGML_GRAPH_HASHTABLE_SIZE: u32 = 8273; +pub const GGML_N_TASKS_MAX: i32 = -1; pub const QK_K: u32 = 256; pub const K_SCALE_SIZE: u32 = 12; pub type ggml_fp16_t = u16; @@ -103,49 +108,58 @@ pub const ggml_op_GGML_OP_MEAN: ggml_op = 13; pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14; pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15; pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16; -pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 17; -pub const ggml_op_GGML_OP_NORM: ggml_op = 18; -pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19; -pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 20; -pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 21; -pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 22; -pub const ggml_op_GGML_OP_SCALE: ggml_op = 23; -pub const ggml_op_GGML_OP_SET: ggml_op = 24; -pub const ggml_op_GGML_OP_CPY: ggml_op = 25; -pub const ggml_op_GGML_OP_CONT: ggml_op = 26; -pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 27; -pub const ggml_op_GGML_OP_VIEW: ggml_op = 28; -pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 29; -pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 30; -pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 31; -pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 32; -pub const ggml_op_GGML_OP_DIAG: ggml_op = 33; -pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 34; -pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 35; -pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 36; -pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 37; -pub const ggml_op_GGML_OP_ROPE: ggml_op = 38; -pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 39; -pub const ggml_op_GGML_OP_ALIBI: ggml_op = 40; -pub const ggml_op_GGML_OP_CLAMP: ggml_op = 41; -pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 42; -pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 43; -pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 44; -pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 45; -pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 46; -pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 47; -pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 48; -pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 49; -pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 50; -pub const ggml_op_GGML_OP_UNARY: ggml_op = 51; -pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 52; -pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 53; -pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 54; -pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 55; -pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 56; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 57; -pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 58; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 59; +pub const ggml_op_GGML_OP_CONCAT: ggml_op = 17; +pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 18; +pub const ggml_op_GGML_OP_NORM: ggml_op = 19; +pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 20; +pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 21; +pub const ggml_op_GGML_OP_GROUP_NORM: ggml_op = 22; +pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 23; +pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 24; +pub const ggml_op_GGML_OP_SCALE: ggml_op = 25; +pub const ggml_op_GGML_OP_SET: ggml_op = 26; +pub const ggml_op_GGML_OP_CPY: ggml_op = 27; +pub const ggml_op_GGML_OP_CONT: ggml_op = 28; +pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 29; +pub const ggml_op_GGML_OP_VIEW: ggml_op = 30; +pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 31; +pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op 
= 32; +pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 33; +pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 34; +pub const ggml_op_GGML_OP_DIAG: ggml_op = 35; +pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 36; +pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 37; +pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 38; +pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 39; +pub const ggml_op_GGML_OP_ROPE: ggml_op = 40; +pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 41; +pub const ggml_op_GGML_OP_ALIBI: ggml_op = 42; +pub const ggml_op_GGML_OP_CLAMP: ggml_op = 43; +pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 44; +pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 45; +pub const ggml_op_GGML_OP_CONV_TRANSPOSE_2D: ggml_op = 46; +pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 47; +pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 48; +pub const ggml_op_GGML_OP_UPSCALE: ggml_op = 49; +pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 50; +pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 51; +pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 52; +pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 53; +pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 54; +pub const ggml_op_GGML_OP_GET_REL_POS: ggml_op = 55; +pub const ggml_op_GGML_OP_ADD_REL_POS: ggml_op = 56; +pub const ggml_op_GGML_OP_UNARY: ggml_op = 57; +pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 58; +pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 59; +pub const ggml_op_GGML_OP_MAP_CUSTOM1_F32: ggml_op = 60; +pub const ggml_op_GGML_OP_MAP_CUSTOM2_F32: ggml_op = 61; +pub const ggml_op_GGML_OP_MAP_CUSTOM3_F32: ggml_op = 62; +pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63; +pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64; +pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66; +pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 68; pub type ggml_op = ::std::os::raw::c_uint; pub const ggml_unary_op_GGML_UNARY_OP_ABS: ggml_unary_op = 0; pub const ggml_unary_op_GGML_UNARY_OP_SGN: ggml_unary_op = 1; @@ -253,8 +267,10 @@ pub struct ggml_tensor { pub perf_runs: ::std::os::raw::c_int, pub perf_cycles: i64, pub perf_time_us: i64, + pub view_src: *mut ggml_tensor, + pub view_offs: usize, pub data: *mut ::std::os::raw::c_void, - pub name: [::std::os::raw::c_char; 48usize], + pub name: [::std::os::raw::c_char; 64usize], pub extra: *mut ::std::os::raw::c_void, pub padding: [::std::os::raw::c_char; 4usize], } @@ -264,7 +280,7 @@ fn bindgen_test_layout_ggml_tensor() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 272usize, + 304usize, concat!("Size of: ", stringify!(ggml_tensor)) ); assert_eq!( @@ -403,8 +419,28 @@ fn bindgen_test_layout_ggml_tensor() { ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).view_src) as usize - ptr as usize }, 200usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(view_src) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).view_offs) as usize - ptr as usize }, + 208usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(view_offs) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize }, + 216usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -414,7 +450,7 @@ fn bindgen_test_layout_ggml_tensor() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).name) as usize - ptr as usize }, - 
208usize, + 224usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -424,7 +460,7 @@ fn bindgen_test_layout_ggml_tensor() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize }, - 256usize, + 288usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -434,7 +470,7 @@ fn bindgen_test_layout_ggml_tensor() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize }, - 264usize, + 296usize, concat!( "Offset of field: ", stringify!(ggml_tensor), @@ -443,7 +479,7 @@ fn bindgen_test_layout_ggml_tensor() { ) ); } -pub const GGML_TENSOR_SIZE: usize = 272; +pub const GGML_TENSOR_SIZE: usize = 304; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct ggml_cplan { @@ -867,6 +903,9 @@ extern "C" { extern "C" { pub fn ggml_nbytes(tensor: *const ggml_tensor) -> usize; } +extern "C" { + pub fn ggml_nbytes_pad(tensor: *const ggml_tensor) -> usize; +} extern "C" { pub fn ggml_nbytes_split( tensor: *const ggml_tensor, @@ -909,6 +948,9 @@ extern "C" { extern "C" { pub fn ggml_is_permuted(tensor: *const ggml_tensor) -> bool; } +extern "C" { + pub fn ggml_are_same_shape(t0: *const ggml_tensor, t1: *const ggml_tensor) -> bool; +} extern "C" { pub fn ggml_tensor_overhead() -> usize; } @@ -991,7 +1033,7 @@ extern "C" { pub fn ggml_dup_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor; + pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *mut ggml_tensor) -> *mut ggml_tensor; } extern "C" { pub fn ggml_get_tensor( @@ -1187,6 +1229,13 @@ extern "C" { b: *mut ggml_tensor, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_concat( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; } @@ -1256,10 +1305,14 @@ extern "C" { ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; + pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; + pub fn ggml_norm_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + eps: f32, + ) -> *mut ggml_tensor; } extern "C" { pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32) @@ -1272,11 +1325,26 @@ extern "C" { eps: f32, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_group_norm( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + n_groups: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_group_norm_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + n_groups: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_rms_norm_back( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, + eps: f32, ) -> *mut ggml_tensor; } extern "C" { @@ -1591,6 +1659,16 @@ extern "C" { freq_scale: f32, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_rope_xpos_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + n_past: ::std::os::raw::c_int, + n_dims: ::std::os::raw::c_int, + base: f32, + down: bool, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_rope_back( ctx: *mut ggml_context, @@ -1599,6 +1677,10 @@ extern "C" { n_dims: ::std::os::raw::c_int, mode: ::std::os::raw::c_int, n_ctx: ::std::os::raw::c_int, + freq_base: f32, 
+ freq_scale: f32, + xpos_base: f32, + xpos_down: bool, ) -> *mut ggml_tensor; } extern "C" { @@ -1628,6 +1710,15 @@ extern "C" { d0: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_conv_1d_ph( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + s: ::std::os::raw::c_int, + d: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_conv_2d( ctx: *mut ggml_context, @@ -1642,12 +1733,25 @@ extern "C" { ) -> *mut ggml_tensor; } extern "C" { - pub fn ggml_conv_1d_ph( + pub fn ggml_conv_2d_sk_p0( ctx: *mut ggml_context, a: *mut ggml_tensor, b: *mut ggml_tensor, - s: ::std::os::raw::c_int, - d: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_conv_2d_s1_ph( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_conv_transpose_2d_p0( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + stride: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0; @@ -1677,6 +1781,13 @@ extern "C" { p1: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_upscale( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + scale_factor: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_flash_attn( ctx: *mut ggml_context, @@ -1722,6 +1833,44 @@ extern "C" { w: ::std::os::raw::c_int, ) -> *mut ggml_tensor; } +extern "C" { + pub fn ggml_unary( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + op: ggml_unary_op, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_unary_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + op: ggml_unary_op, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_get_rel_pos( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + qh: ::std::os::raw::c_int, + kh: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_add_rel_pos( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + pw: *mut ggml_tensor, + ph: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_add_rel_pos_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + pw: *mut ggml_tensor, + ph: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} pub type ggml_unary_op_f32_t = ::std::option::Option< unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32), >; @@ -1750,20 +1899,6 @@ pub type ggml_custom3_op_f32_t = ::std::option::Option< arg4: *const ggml_tensor, ), >; -extern "C" { - pub fn ggml_unary( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - op: ggml_unary_op, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_unary_inplace( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - op: ggml_unary_op, - ) -> *mut ggml_tensor; -} extern "C" { pub fn ggml_map_unary_f32( ctx: *mut ggml_context, @@ -1842,6 +1977,96 @@ extern "C" { fun: ggml_custom3_op_f32_t, ) -> *mut ggml_tensor; } +pub type ggml_custom1_op_t = ::std::option::Option< + unsafe extern "C" fn( + dst: *mut ggml_tensor, + a: *const ggml_tensor, + ith: ::std::os::raw::c_int, + nth: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ), +>; +pub type ggml_custom2_op_t = ::std::option::Option< + unsafe extern "C" fn( + dst: *mut ggml_tensor, + a: *const ggml_tensor, + b: *const ggml_tensor, + ith: ::std::os::raw::c_int, + nth: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ), +>; +pub type ggml_custom3_op_t = ::std::option::Option< + unsafe extern "C" fn( + dst: *mut 
ggml_tensor, + a: *const ggml_tensor, + b: *const ggml_tensor, + c: *const ggml_tensor, + ith: ::std::os::raw::c_int, + nth: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ), +>; +extern "C" { + pub fn ggml_map_custom1( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_custom1_op_t, + n_tasks: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom1_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_custom1_op_t, + n_tasks: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom2( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_custom2_op_t, + n_tasks: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom2_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_custom2_op_t, + n_tasks: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom3( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + c: *mut ggml_tensor, + fun: ggml_custom3_op_t, + n_tasks: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_custom3_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + c: *mut ggml_tensor, + fun: ggml_custom3_op_t, + n_tasks: ::std::os::raw::c_int, + userdata: *mut ::std::os::raw::c_void, + ) -> *mut ggml_tensor; +} extern "C" { pub fn ggml_cross_entropy_loss( ctx: *mut ggml_context, @@ -1863,6 +2088,14 @@ extern "C" { extern "C" { pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor); } +extern "C" { + pub fn ggml_build_backward_expand( + ctx: *mut ggml_context, + gf: *mut ggml_cgraph, + gb: *mut ggml_cgraph, + keep: bool, + ); +} extern "C" { pub fn ggml_build_forward(tensor: *mut ggml_tensor) -> ggml_cgraph; } @@ -1952,6 +2185,8 @@ pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_STEP: ggml_opt_result = -126; pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_ITERATIONS: ggml_opt_result = -125; pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result = -124; pub type ggml_opt_result = ::std::os::raw::c_int; +pub type ggml_opt_callback = + ::std::option::Option; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct ggml_opt_params { @@ -1971,12 +2206,14 @@ pub struct ggml_opt_params__bindgen_ty_1 { pub n_iter: ::std::os::raw::c_int, pub sched: f32, pub decay: f32, + pub decay_min_ndim: ::std::os::raw::c_int, pub alpha: f32, pub beta1: f32, pub beta2: f32, pub eps: f32, pub eps_f: f32, pub eps_g: f32, + pub gclip: f32, } #[test] fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { @@ -1985,7 +2222,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 36usize, + 44usize, concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_1)) ); assert_eq!( @@ -2024,8 +2261,18 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).decay_min_ndim) as usize - ptr as usize }, 12usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(decay_min_ndim) + ) + ); + 
assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize }, + 16usize, concat!( "Offset of field: ", stringify!(ggml_opt_params__bindgen_ty_1), @@ -2035,7 +2282,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).beta1) as usize - ptr as usize }, - 16usize, + 20usize, concat!( "Offset of field: ", stringify!(ggml_opt_params__bindgen_ty_1), @@ -2045,7 +2292,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).beta2) as usize - ptr as usize }, - 20usize, + 24usize, concat!( "Offset of field: ", stringify!(ggml_opt_params__bindgen_ty_1), @@ -2055,7 +2302,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize }, - 24usize, + 28usize, concat!( "Offset of field: ", stringify!(ggml_opt_params__bindgen_ty_1), @@ -2065,7 +2312,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).eps_f) as usize - ptr as usize }, - 28usize, + 32usize, concat!( "Offset of field: ", stringify!(ggml_opt_params__bindgen_ty_1), @@ -2075,7 +2322,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).eps_g) as usize - ptr as usize }, - 32usize, + 36usize, concat!( "Offset of field: ", stringify!(ggml_opt_params__bindgen_ty_1), @@ -2083,6 +2330,16 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { stringify!(eps_g) ) ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).gclip) as usize - ptr as usize }, + 40usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(gclip) + ) + ); } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -2209,7 +2466,7 @@ fn bindgen_test_layout_ggml_opt_params() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 96usize, + 104usize, concat!("Size of: ", stringify!(ggml_opt_params)) ); assert_eq!( @@ -2299,7 +2556,7 @@ fn bindgen_test_layout_ggml_opt_params() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize }, - 60usize, + 68usize, concat!( "Offset of field: ", stringify!(ggml_opt_params), @@ -2316,19 +2573,16 @@ pub struct ggml_opt_context { pub iter: ::std::os::raw::c_int, pub nx: i64, pub just_initialized: bool, + pub loss_before: f32, + pub loss_after: f32, pub adam: ggml_opt_context__bindgen_ty_1, pub lbfgs: ggml_opt_context__bindgen_ty_2, } #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct ggml_opt_context__bindgen_ty_1 { - pub x: *mut ggml_tensor, - pub g1: *mut ggml_tensor, - pub g2: *mut ggml_tensor, pub m: *mut ggml_tensor, pub v: *mut ggml_tensor, - pub mh: *mut ggml_tensor, - pub vh: *mut ggml_tensor, pub pf: *mut ggml_tensor, pub fx_best: f32, pub fx_prev: f32, @@ -2341,7 +2595,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 80usize, + 40usize, concat!("Size of: ", stringify!(ggml_opt_context__bindgen_ty_1)) ); assert_eq!( @@ -2350,113 +2604,63 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() { concat!("Alignment of ", stringify!(ggml_opt_context__bindgen_ty_1)) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).x) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize }, 0usize, concat!( "Offset of field: ", stringify!(ggml_opt_context__bindgen_ty_1), "::", - stringify!(x) + stringify!(m) ) ); 
assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).g1) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize }, 8usize, concat!( "Offset of field: ", stringify!(ggml_opt_context__bindgen_ty_1), "::", - stringify!(g1) + stringify!(v) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).g2) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize }, 16usize, concat!( "Offset of field: ", stringify!(ggml_opt_context__bindgen_ty_1), "::", - stringify!(g2) + stringify!(pf) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).fx_best) as usize - ptr as usize }, 24usize, concat!( "Offset of field: ", stringify!(ggml_opt_context__bindgen_ty_1), "::", - stringify!(m) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize }, - 32usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_context__bindgen_ty_1), - "::", - stringify!(v) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).mh) as usize - ptr as usize }, - 40usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_context__bindgen_ty_1), - "::", - stringify!(mh) + stringify!(fx_best) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).vh) as usize - ptr as usize }, - 48usize, + unsafe { ::std::ptr::addr_of!((*ptr).fx_prev) as usize - ptr as usize }, + 28usize, concat!( "Offset of field: ", stringify!(ggml_opt_context__bindgen_ty_1), "::", - stringify!(vh) + stringify!(fx_prev) ) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize }, - 56usize, + unsafe { ::std::ptr::addr_of!((*ptr).n_no_improvement) as usize - ptr as usize }, + 32usize, concat!( "Offset of field: ", stringify!(ggml_opt_context__bindgen_ty_1), "::", - stringify!(pf) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).fx_best) as usize - ptr as usize }, - 64usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_context__bindgen_ty_1), - "::", - stringify!(fx_best) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).fx_prev) as usize - ptr as usize }, - 68usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_context__bindgen_ty_1), - "::", - stringify!(fx_prev) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_no_improvement) as usize - ptr as usize }, - 72usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_context__bindgen_ty_1), - "::", - stringify!(n_no_improvement) + stringify!(n_no_improvement) ) ); } @@ -2662,7 +2866,7 @@ fn bindgen_test_layout_ggml_opt_context() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 312usize, + 288usize, concat!("Size of: ", stringify!(ggml_opt_context)) ); assert_eq!( @@ -2692,7 +2896,7 @@ fn bindgen_test_layout_ggml_opt_context() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).iter) as usize - ptr as usize }, - 104usize, + 112usize, concat!( "Offset of field: ", stringify!(ggml_opt_context), @@ -2702,7 +2906,7 @@ fn bindgen_test_layout_ggml_opt_context() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).nx) as usize - ptr as usize }, - 112usize, + 120usize, concat!( "Offset of field: ", stringify!(ggml_opt_context), @@ -2712,7 +2916,7 @@ fn bindgen_test_layout_ggml_opt_context() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).just_initialized) as usize - ptr as usize }, - 120usize, + 128usize, concat!( "Offset of field: ", stringify!(ggml_opt_context), @@ -2720,9 +2924,29 @@ fn 
bindgen_test_layout_ggml_opt_context() { stringify!(just_initialized) ) ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).loss_before) as usize - ptr as usize }, + 132usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_context), + "::", + stringify!(loss_before) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).loss_after) as usize - ptr as usize }, + 136usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_context), + "::", + stringify!(loss_after) + ) + ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize }, - 128usize, + 144usize, concat!( "Offset of field: ", stringify!(ggml_opt_context), @@ -2732,7 +2956,7 @@ fn bindgen_test_layout_ggml_opt_context() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize }, - 208usize, + 184usize, concat!( "Offset of field: ", stringify!(ggml_opt_context), @@ -2773,6 +2997,8 @@ extern "C" { f: *mut ggml_tensor, gf: *mut ggml_cgraph, gb: *mut ggml_cgraph, + callback: ggml_opt_callback, + callback_data: *mut ::std::os::raw::c_void, ) -> ggml_opt_result; } extern "C" { @@ -2830,6 +3056,282 @@ extern "C" { hist: *mut i64, ) -> usize; } +pub const gguf_type_GGUF_TYPE_UINT8: gguf_type = 0; +pub const gguf_type_GGUF_TYPE_INT8: gguf_type = 1; +pub const gguf_type_GGUF_TYPE_UINT16: gguf_type = 2; +pub const gguf_type_GGUF_TYPE_INT16: gguf_type = 3; +pub const gguf_type_GGUF_TYPE_UINT32: gguf_type = 4; +pub const gguf_type_GGUF_TYPE_INT32: gguf_type = 5; +pub const gguf_type_GGUF_TYPE_FLOAT32: gguf_type = 6; +pub const gguf_type_GGUF_TYPE_BOOL: gguf_type = 7; +pub const gguf_type_GGUF_TYPE_STRING: gguf_type = 8; +pub const gguf_type_GGUF_TYPE_ARRAY: gguf_type = 9; +pub const gguf_type_GGUF_TYPE_UINT64: gguf_type = 10; +pub const gguf_type_GGUF_TYPE_INT64: gguf_type = 11; +pub const gguf_type_GGUF_TYPE_FLOAT64: gguf_type = 12; +pub const gguf_type_GGUF_TYPE_COUNT: gguf_type = 13; +pub type gguf_type = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct gguf_context { + _unused: [u8; 0], +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct gguf_init_params { + pub no_alloc: bool, + pub ctx: *mut *mut ggml_context, +} +#[test] +fn bindgen_test_layout_gguf_init_params() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 16usize, + concat!("Size of: ", stringify!(gguf_init_params)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(gguf_init_params)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).no_alloc) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(gguf_init_params), + "::", + stringify!(no_alloc) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).ctx) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(gguf_init_params), + "::", + stringify!(ctx) + ) + ); +} +extern "C" { + pub fn gguf_init_empty() -> *mut gguf_context; +} +extern "C" { + pub fn gguf_init_from_file( + fname: *const ::std::os::raw::c_char, + params: gguf_init_params, + ) -> *mut gguf_context; +} +extern "C" { + pub fn gguf_free(ctx: *mut gguf_context); +} +extern "C" { + pub fn gguf_type_name(type_: gguf_type) -> *const ::std::os::raw::c_char; +} +extern "C" { + pub fn gguf_get_version(ctx: *const gguf_context) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn gguf_get_alignment(ctx: *const gguf_context) -> usize; +} +extern "C" { + pub 
fn gguf_get_data_offset(ctx: *const gguf_context) -> usize; +} +extern "C" { + pub fn gguf_get_data(ctx: *const gguf_context) -> *mut ::std::os::raw::c_void; +} +extern "C" { + pub fn gguf_get_n_kv(ctx: *const gguf_context) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn gguf_find_key( + ctx: *const gguf_context, + key: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn gguf_get_key( + ctx: *const gguf_context, + i: ::std::os::raw::c_int, + ) -> *const ::std::os::raw::c_char; +} +extern "C" { + pub fn gguf_get_kv_type(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> gguf_type; +} +extern "C" { + pub fn gguf_get_arr_type(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> gguf_type; +} +extern "C" { + pub fn gguf_get_val_u8(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u8; +} +extern "C" { + pub fn gguf_get_val_i8(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i8; +} +extern "C" { + pub fn gguf_get_val_u16(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u16; +} +extern "C" { + pub fn gguf_get_val_i16(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i16; +} +extern "C" { + pub fn gguf_get_val_u32(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u32; +} +extern "C" { + pub fn gguf_get_val_i32(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i32; +} +extern "C" { + pub fn gguf_get_val_f32(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> f32; +} +extern "C" { + pub fn gguf_get_val_u64(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u64; +} +extern "C" { + pub fn gguf_get_val_i64(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i64; +} +extern "C" { + pub fn gguf_get_val_f64(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> f64; +} +extern "C" { + pub fn gguf_get_val_bool(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> bool; +} +extern "C" { + pub fn gguf_get_val_str( + ctx: *const gguf_context, + i: ::std::os::raw::c_int, + ) -> *const ::std::os::raw::c_char; +} +extern "C" { + pub fn gguf_get_arr_n( + ctx: *const gguf_context, + i: ::std::os::raw::c_int, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn gguf_get_arr_data( + ctx: *const gguf_context, + i: ::std::os::raw::c_int, + ) -> *const ::std::os::raw::c_void; +} +extern "C" { + pub fn gguf_get_arr_str( + ctx: *const gguf_context, + key_id: ::std::os::raw::c_int, + i: ::std::os::raw::c_int, + ) -> *const ::std::os::raw::c_char; +} +extern "C" { + pub fn gguf_get_n_tensors(ctx: *const gguf_context) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn gguf_find_tensor( + ctx: *const gguf_context, + name: *const ::std::os::raw::c_char, + ) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn gguf_get_tensor_offset(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> usize; +} +extern "C" { + pub fn gguf_get_tensor_name( + ctx: *const gguf_context, + i: ::std::os::raw::c_int, + ) -> *mut ::std::os::raw::c_char; +} +extern "C" { + pub fn gguf_set_val_u8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u8); +} +extern "C" { + pub fn gguf_set_val_i8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i8); +} +extern "C" { + pub fn gguf_set_val_u16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u16); +} +extern "C" { + pub fn gguf_set_val_i16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i16); +} +extern "C" { + pub fn gguf_set_val_u32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u32); +} +extern "C" { + pub fn gguf_set_val_i32(ctx: *mut 
gguf_context, key: *const ::std::os::raw::c_char, val: i32); +} +extern "C" { + pub fn gguf_set_val_f32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f32); +} +extern "C" { + pub fn gguf_set_val_u64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u64); +} +extern "C" { + pub fn gguf_set_val_i64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i64); +} +extern "C" { + pub fn gguf_set_val_f64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f64); +} +extern "C" { + pub fn gguf_set_val_bool(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: bool); +} +extern "C" { + pub fn gguf_set_val_str( + ctx: *mut gguf_context, + key: *const ::std::os::raw::c_char, + val: *const ::std::os::raw::c_char, + ); +} +extern "C" { + pub fn gguf_set_arr_data( + ctx: *mut gguf_context, + key: *const ::std::os::raw::c_char, + type_: gguf_type, + data: *const ::std::os::raw::c_void, + n: ::std::os::raw::c_int, + ); +} +extern "C" { + pub fn gguf_set_arr_str( + ctx: *mut gguf_context, + key: *const ::std::os::raw::c_char, + data: *mut *const ::std::os::raw::c_char, + n: ::std::os::raw::c_int, + ); +} +extern "C" { + pub fn gguf_set_kv(ctx: *mut gguf_context, src: *mut gguf_context); +} +extern "C" { + pub fn gguf_add_tensor(ctx: *mut gguf_context, tensor: *const ggml_tensor); +} +extern "C" { + pub fn gguf_set_tensor_type( + ctx: *mut gguf_context, + name: *const ::std::os::raw::c_char, + type_: ggml_type, + ); +} +extern "C" { + pub fn gguf_set_tensor_data( + ctx: *mut gguf_context, + name: *const ::std::os::raw::c_char, + data: *const ::std::os::raw::c_void, + size: usize, + ); +} +extern "C" { + pub fn gguf_write_to_file( + ctx: *const gguf_context, + fname: *const ::std::os::raw::c_char, + only_meta: bool, + ); +} +extern "C" { + pub fn gguf_get_meta_size(ctx: *const gguf_context) -> usize; +} +extern "C" { + pub fn gguf_get_meta_data(ctx: *const gguf_context, data: *mut ::std::os::raw::c_void); +} extern "C" { pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int; } @@ -2854,6 +3356,9 @@ extern "C" { extern "C" { pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int; } +extern "C" { + pub fn ggml_cpu_has_metal() -> ::std::os::raw::c_int; +} extern "C" { pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int; } @@ -2878,6 +3383,9 @@ extern "C" { extern "C" { pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int; } +extern "C" { + pub fn ggml_cpu_has_ssse3() -> ::std::os::raw::c_int; +} extern "C" { pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int; } @@ -2898,6 +3406,10 @@ pub type ggml_vec_dot_t = ::std::option::Option< #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct ggml_type_traits_t { + pub type_name: *const ::std::os::raw::c_char, + pub blck_size: ::std::os::raw::c_int, + pub type_size: usize, + pub is_quantized: bool, pub to_float: ggml_to_float_t, pub from_float: ggml_from_float_t, pub from_float_reference: ggml_from_float_t, @@ -2910,7 +3422,7 @@ fn bindgen_test_layout_ggml_type_traits_t() { let ptr = UNINIT.as_ptr(); assert_eq!( ::std::mem::size_of::(), - 40usize, + 72usize, concat!("Size of: ", stringify!(ggml_type_traits_t)) ); assert_eq!( @@ -2919,8 +3431,48 @@ fn bindgen_test_layout_ggml_type_traits_t() { concat!("Alignment of ", stringify!(ggml_type_traits_t)) ); assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize }, + unsafe { ::std::ptr::addr_of!((*ptr).type_name) as usize - ptr as usize }, 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_type_traits_t), + "::", + 
stringify!(type_name) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).blck_size) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_type_traits_t), + "::", + stringify!(blck_size) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).type_size) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(ggml_type_traits_t), + "::", + stringify!(type_size) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).is_quantized) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(ggml_type_traits_t), + "::", + stringify!(is_quantized) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize }, + 32usize, concat!( "Offset of field: ", stringify!(ggml_type_traits_t), @@ -2930,7 +3482,7 @@ fn bindgen_test_layout_ggml_type_traits_t() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize }, - 8usize, + 40usize, concat!( "Offset of field: ", stringify!(ggml_type_traits_t), @@ -2940,7 +3492,7 @@ fn bindgen_test_layout_ggml_type_traits_t() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize }, - 16usize, + 48usize, concat!( "Offset of field: ", stringify!(ggml_type_traits_t), @@ -2950,7 +3502,7 @@ fn bindgen_test_layout_ggml_type_traits_t() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize }, - 24usize, + 56usize, concat!( "Offset of field: ", stringify!(ggml_type_traits_t), @@ -2960,7 +3512,7 @@ fn bindgen_test_layout_ggml_type_traits_t() { ); assert_eq!( unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize }, - 32usize, + 64usize, concat!( "Offset of field: ", stringify!(ggml_type_traits_t), @@ -2970,7 +3522,7 @@ fn bindgen_test_layout_ggml_type_traits_t() { ); } extern "C" { - pub fn ggml_internal_get_type_traits(i: ggml_type) -> ggml_type_traits_t; + pub fn ggml_internal_get_type_traits(type_: ggml_type) -> ggml_type_traits_t; } #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -3513,3 +4065,40 @@ extern "C" { hist: *mut i64, ) -> usize; } +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_allocr { + _unused: [u8; 0], +} +extern "C" { + pub fn ggml_allocr_new( + data: *mut ::std::os::raw::c_void, + size: usize, + alignment: usize, + ) -> *mut ggml_allocr; +} +extern "C" { + pub fn ggml_allocr_new_measure(alignment: usize) -> *mut ggml_allocr; +} +extern "C" { + pub fn ggml_allocr_set_parse_seq( + alloc: *mut ggml_allocr, + list: *const ::std::os::raw::c_int, + n: ::std::os::raw::c_int, + ); +} +extern "C" { + pub fn ggml_allocr_free(alloc: *mut ggml_allocr); +} +extern "C" { + pub fn ggml_allocr_is_measure(alloc: *mut ggml_allocr) -> bool; +} +extern "C" { + pub fn ggml_allocr_reset(alloc: *mut ggml_allocr); +} +extern "C" { + pub fn ggml_allocr_alloc(alloc: *mut ggml_allocr, tensor: *mut ggml_tensor); +} +extern "C" { + pub fn ggml_allocr_alloc_graph(alloc: *mut ggml_allocr, graph: *mut ggml_cgraph) -> usize; +} diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs index a8aa42ef..5d06fd4f 100644 --- a/crates/ggml/sys/src/llama.rs +++ b/crates/ggml/sys/src/llama.rs @@ -1,18 +1,10 @@ /* automatically generated by rust-bindgen 0.65.1 */ pub const LLAMA_MAX_DEVICES: u32 = 1; -pub const LLAMA_FILE_MAGIC_GGJT: u32 = 1734830708; -pub const LLAMA_FILE_MAGIC_GGLA: u32 = 1734831201; -pub const LLAMA_FILE_MAGIC_GGMF: u32 = 1734831462; -pub const LLAMA_FILE_MAGIC_GGML: u32 = 1734831468; 
+pub const LLAMA_DEFAULT_SEED: u32 = 4294967295; pub const LLAMA_FILE_MAGIC_GGSN: u32 = 1734833006; -pub const LLAMA_FILE_VERSION: u32 = 3; -pub const LLAMA_FILE_MAGIC: u32 = 1734830708; -pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468; pub const LLAMA_SESSION_MAGIC: u32 = 1734833006; pub const LLAMA_SESSION_VERSION: u32 = 1; -pub const LLAMA_DEFAULT_SEED: u32 = 4294967295; -pub const LLAMA_DEFAULT_RMS_EPS: f64 = 0.000005; pub const LLAMA_FTYPE_ALL_F32: llama_ftype = 0; pub const LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1; pub const LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2; @@ -30,4 +22,5 @@ pub const LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15; pub const LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16; pub const LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17; pub const LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18; +pub const LLAMA_FTYPE_GUESSED: llama_ftype = 1024; pub type llama_ftype = ::std::os::raw::c_int; diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs index bbd16034..95a8f506 100644 --- a/crates/ggml/sys/src/metal.rs +++ b/crates/ggml/sys/src/metal.rs @@ -1,6 +1,7 @@ /* automatically generated by rust-bindgen 0.65.1 */ pub const GGML_METAL_MAX_BUFFERS: u32 = 16; +pub const GGML_METAL_MAX_COMMAND_BUFFERS: u32 = 32; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct ggml_tensor { @@ -22,6 +23,12 @@ extern "C" { extern "C" { pub fn ggml_metal_free(ctx: *mut ggml_metal_context); } +extern "C" { + pub fn ggml_metal_host_malloc(n: usize) -> *mut ::std::os::raw::c_void; +} +extern "C" { + pub fn ggml_metal_host_free(data: *mut ::std::os::raw::c_void); +} extern "C" { pub fn ggml_metal_set_n_cb(ctx: *mut ggml_metal_context, n_cb: ::std::os::raw::c_int); } @@ -41,10 +48,17 @@ extern "C" { pub fn ggml_metal_get_tensor(ctx: *mut ggml_metal_context, t: *mut ggml_tensor); } extern "C" { - pub fn ggml_metal_graph_find_concurrency(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph); + pub fn ggml_metal_graph_find_concurrency( + ctx: *mut ggml_metal_context, + gf: *mut ggml_cgraph, + check_mem: bool, + ); +} +extern "C" { + pub fn ggml_metal_if_optimized(ctx: *mut ggml_metal_context) -> ::std::os::raw::c_int; } extern "C" { - pub fn ggml_metal_if_optimized(ctx: *mut ggml_metal_context) -> bool; + pub fn ggml_metal_get_concur_list(ctx: *mut ggml_metal_context) -> *mut ::std::os::raw::c_int; } extern "C" { pub fn ggml_metal_graph_compute(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph); From 1eb0d79464b5755885c001d99e40505f1fedd741 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 16 Sep 2023 14:35:43 +0200 Subject: [PATCH 05/22] Update llama-cpp --- crates/ggml/sys/llama-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index 8183159c..b08e75ba 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit 8183159cf3def112f6d1fe94815fce70e1bffa12 +Subproject commit b08e75baea294e366628b898e85c0bd359b58115 From ad136e1452c444c795180eddecdfdec2b22877b8 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 16 Sep 2023 14:57:08 +0200 Subject: [PATCH 06/22] Include `ggml-alloc.c` during build --- crates/ggml/sys/build.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index f69cee3b..4a1c3c10 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -12,8 +12,13 @@ fn main() { let 
mut builder = cc::Build::new(); let build = builder - .files(["llama-cpp/ggml.c", "llama-cpp/k_quants.c"]) + .files([ + "llama-cpp/ggml.c", + "llama-cpp/k_quants.c", + "llama-cpp/ggml-alloc.c", + ]) .define("GGML_USE_K_QUANTS", None) + .define("QK_K", Some("256")) .includes(["llama-cpp"]); // This is a very basic heuristic for applying compile flags. From cd97c9df8050e3f96cc80e78f354e4c411c9ae2d Mon Sep 17 00:00:00 2001 From: Olexiy Buyanskyy Date: Wed, 20 Sep 2023 15:14:50 +0300 Subject: [PATCH 07/22] Fix rewind crash n_past can be bigger than tokens.len(); it should not be used to drain the tokens array --- crates/llm-base/src/inference_session.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 67408b34..8c86b0e1 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -360,7 +360,7 @@ impl InferenceSession { } // Remove the tokens from self.tokens. - let token_start = self.n_past - num; + let token_start = self.tokens.len() - num; let deleted_tokens: Vec<_> = self.tokens.drain(token_start..).collect(); // Remove the corresponding chars from decoded From ab381c76c9f8ad82bb41133d8c7d36cc21afbe1b Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Fri, 22 Sep 2023 22:02:31 +0200 Subject: [PATCH 08/22] Hopefully fix linux build --- crates/ggml/sys/build.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index 889100ff..799f1671 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -92,6 +92,10 @@ fn main() { _ => {} } + if compiler.is_like_gnu() && target_os == "linux" { + build.define("_GNU_SOURCE", None); + } + if is_release { build.define("NDEBUG", None); } From 4ebb16e08f1941aec4a019a6065e26fe239fc4f6 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 23 Sep 2023 11:56:28 +0200 Subject: [PATCH 09/22] Remove Scratch Buffers --- crates/ggml/src/context.rs | 22 ------------------- crates/ggml/src/lib.rs | 4 +--- crates/llm-base/src/inference_session.rs | 28 ------------------------ crates/models/falcon/src/lib.rs | 7 ------ crates/models/gpt2/src/lib.rs | 7 +----- crates/models/gptneox/src/lib.rs | 10 --------- crates/models/llama/src/lib.rs | 7 ------ crates/models/mpt/src/lib.rs | 11 ---------- 8 files changed, 2 insertions(+), 94 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 6df8c4e4..6f7a593f 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -206,28 +206,6 @@ impl Context { unsafe { sys::ggml_used_mem(self.as_ptr()) } } - /// Sets the scratch buffer to be used by this [Context]. - /// - /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. - pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) { - let (size, data) = if let Some(buffer) = scratch_buffer { - (buffer.size(), buffer.data) - } else { - (0, std::ptr::null_mut()) - }; - // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API - unsafe { - sys::ggml_set_scratch( - self.as_ptr(), - sys::ggml_scratch { - offs: 0, - size, - data, - }, - ); - } - } - /// Creates a new 1D tensor.
pub fn new_tensor_1d(&self, typ: Type, ne0: usize) -> Tensor { let raw = unsafe { sys::ggml_new_tensor_1d(self.as_ptr(), typ.into(), usize_to_i64(ne0)) }; diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 51160515..d0e35f2c 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -280,9 +280,7 @@ impl Type { } } -/// A buffer of memory that can be used as a scratch buffer for a [Context]. -/// -/// See [Context::use_scratch]. +/// A buffer of memory that can be used as a buffer for a [Context]. #[derive(PartialEq, Eq)] pub struct Buffer { data: *mut c_void, diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 67408b34..799d818f 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -12,21 +12,6 @@ use crate::{ TokenId, TokenUtf8Buffer, TokenizationError, }; -// The size of a scratch buffer used for inference. This is used for temporary -// storage of intermediate results during inference. -// -// The specific value was copied from `llama.cpp`. -const SCRATCH_SIZE: usize = 512 * 1024 * 1024; - -type ScratchBuffers = [ggml::Buffer; 2]; - -fn scratch_buffers() -> ScratchBuffers { - [ - ggml::Buffer::new(SCRATCH_SIZE), - ggml::Buffer::new(SCRATCH_SIZE), - ] -} - /// Result of graph building pub struct GraphOutputs { /// The output containing the model's result @@ -89,8 +74,6 @@ pub struct InferenceSession { ctx0: Context, n_embd: usize, - - scratch: ScratchBuffers, } pub struct BuildContext<'session> { @@ -99,13 +82,6 @@ pub struct BuildContext<'session> { pub embd: &'session Tensor, pub memory_k: &'session Tensor, pub memory_v: &'session Tensor, - pub scratch: &'session ScratchBuffers, -} - -impl<'session> BuildContext<'session> { - pub fn get_scratch(&self, idx: usize) -> Option<&Buffer> { - Some(&self.scratch[idx]) - } } unsafe impl Send for InferenceSession {} @@ -159,8 +135,6 @@ impl InferenceSession { let n_elements = n_embd * n_mem; let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements); - let scratch = scratch_buffers(); - // Allocate buffer for storing intermediate values during evaluation (ctx0 backing) // For the first run, we need to guess a maximum buffer size so we can measure // the actual memory consumption of the temporary ggml context. 
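For reference, the sizing heuristic this comment describes (it survives this patch as context and is only removed later in the series in favor of measured allocation) is the llama.cpp-derived guess below, scaled by layer count:

    let buf_size_mb = if n_layer >= 80 {
        1536
    } else if n_layer >= 60 {
        1280
    } else {
        1024
    };
    let buf_size = buf_size_mb * 1024 * 1024 + ggml::graph_overhead();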
@@ -212,7 +186,6 @@ impl InferenceSession { metal_context, ctx0, n_embd, - scratch, } } @@ -238,7 +211,6 @@ impl InferenceSession { embd: &embd, memory_k: &self.memory_k, memory_v: &self.memory_v, - scratch: &mut self.scratch, }; let (mut built_gf, built_result) = builder(bc); diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 0322e2f2..48edc107 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -192,7 +192,6 @@ impl KnownModel for Falcon { for il in 0..n_layer { // attention uses first scratch buffer - ctx0.use_scratch(builder.get_scratch(0)); ctx0.set_offloading(self.params.should_offload(il)); // self-attention @@ -319,9 +318,6 @@ impl KnownModel for Falcon { // projection current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - // feed forward uses second scratch buffer - ctx0.use_scratch(builder.get_scratch(1)); - let inp_ff = layernorm_output.share(); let attn_out = ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); @@ -336,8 +332,6 @@ impl KnownModel for Falcon { input_layer = current.share(); } - ctx0.use_scratch(builder.get_scratch(0)); - // norm input_layer = ctx0.op_norm(&input_layer); @@ -349,7 +343,6 @@ impl KnownModel for Falcon { let embeddings_tensor: ggml::Tensor = input_layer.share(); ctx0.set_offloading(false); - ctx0.use_scratch(None); // lm_head input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index b4434ad5..5a6df70c 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -174,7 +174,7 @@ impl KnownModel for Gpt2 { let mut gf = ctx0.create_compute_graph(); for il in 0..n_layer { ctx0.set_offloading(self.params.should_offload(il)); - ctx0.use_scratch(builder.get_scratch(0)); + // norm let mut current = ctx0.op_norm(&input_layer); current = ctx0.op_add( @@ -281,8 +281,6 @@ impl KnownModel for Gpt2 { // feed-forward let ff_in = current.share(); - ctx0.use_scratch(builder.get_scratch(1)); - // feed-forward normalization current = ctx0.op_norm(&ff_in); current = ctx0.op_add( @@ -305,13 +303,10 @@ impl KnownModel for Gpt2 { input_layer = ctx0.op_add(¤t, &ff_in); } - ctx0.use_scratch(builder.get_scratch(0)); - // normalization input_layer = ctx0.op_norm(&input_layer); input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b); - ctx0.use_scratch(None); ctx0.set_offloading(false); let embeddings_tensor: ggml::Tensor = input_layer.share(); diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 9075eb01..70b68170 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -186,8 +186,6 @@ impl KnownModel for GptNeoX { for il in 0..n_layer { ctx0.set_offloading(self.params.should_offload(il)); - // attention uses first scratch buffer - ctx0.use_scratch(builder.get_scratch(0)); // self-attention let mut current = ctx0.op_norm(&input_layer); @@ -301,9 +299,6 @@ impl KnownModel for GptNeoX { current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); current = ctx0.op_add(¤t, &self.layers[il].c_attn_proj_b); - // use the second scratch for the feed forward - ctx0.use_scratch(builder.get_scratch(1)); - let feedforward_input: Tensor; if !use_parallel_residual { feedforward_input = ctx0.op_add(¤t, &input_layer); @@ -326,9 +321,6 @@ impl KnownModel for GptNeoX { } } - // use the first scratch for the norm - ctx0.use_scratch(builder.get_scratch(0)); - // normalize the output input_layer = 
ctx0.op_norm(&input_layer); // inpL = ln_f_g*inpL + ln_f_b @@ -336,8 +328,6 @@ impl KnownModel for GptNeoX { let embeddings_tensor: ggml::Tensor = input_layer.share(); - // Disable the scratchbuffer - ctx0.use_scratch(None); ctx0.set_offloading(false); // apply language model head input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer); diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index a70f315f..0adf1d57 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -177,8 +177,6 @@ impl KnownModel for Llama { let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; - ctx0.use_scratch(builder.get_scratch(0)); - // norm current = ctx0.op_rms_norm(&input_layer); @@ -309,8 +307,6 @@ impl KnownModel for Llama { // projection (no bias) current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - ctx0.use_scratch(builder.get_scratch(1)); - let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); // feed-forward network @@ -337,8 +333,6 @@ impl KnownModel for Llama { input_layer = current; } - ctx0.use_scratch(builder.get_scratch(0)); - // norm input_layer = ctx0.op_rms_norm(&input_layer); @@ -351,7 +345,6 @@ impl KnownModel for Llama { // lm_head input_layer = ctx0.op_mul_mat(&self.output, &input_layer); - ctx0.use_scratch(None); ( gf, GraphOutputs { diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 3d22efff..5d36d310 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -123,9 +123,6 @@ impl KnownModel for Mpt { let mut gf = ctx0.create_compute_graph(); for il in 0..n_layer { - // attention uses first scratch buffer - ctx0.use_scratch(builder.get_scratch(0)); - let mut current = ctx0.op_norm(&input_layer); current = ctx0.op_mul(¤t, &self.layers[il].norm_1_weight); @@ -213,9 +210,6 @@ impl KnownModel for Mpt { input_layer = ctx0.op_add(&input_layer, ¤t); - // feed forward uses second scratch buffer - ctx0.use_scratch(builder.get_scratch(1)); - current = ctx0.op_norm(&input_layer); current = ctx0.op_mul(¤t, &self.layers[il].norm_2_weight); @@ -229,17 +223,12 @@ impl KnownModel for Mpt { input_layer = ctx0.op_add(&input_layer, ¤t); } - //use scratch buffer 0 for the rest - ctx0.use_scratch(builder.get_scratch(0)); - // norm input_layer = ctx0.op_norm(&input_layer); input_layer = ctx0.op_mul(&input_layer, &self.norm); let embeddings_tensor: ggml::Tensor = input_layer.share(); - // disable scratch buffer for last layer - ctx0.use_scratch(None); // output embedding weight tied to input embedding input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); From 995dd79718f6e81f1381cfd0a357d6bb714f7515 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 23 Sep 2023 20:11:20 +0200 Subject: [PATCH 10/22] Use `GraphAllocator` in LLaMA architecture --- crates/ggml/src/lib.rs | 66 +++++++++++++++++++++++- crates/llm-base/src/inference_session.rs | 60 ++++++++++++++++++++- crates/models/bloom/src/lib.rs | 1 + crates/models/falcon/src/lib.rs | 1 + crates/models/gpt2/src/lib.rs | 1 + crates/models/gptj/src/lib.rs | 1 + crates/models/gptneox/src/lib.rs | 1 + crates/models/llama/src/lib.rs | 25 +++++++-- crates/models/mpt/src/lib.rs | 1 + 9 files changed, 148 insertions(+), 9 deletions(-) diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index d0e35f2c..507b1f60 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -280,8 +280,8 @@ impl Type { } } -/// A buffer of memory that can be used as a buffer for a 
[Context]. -#[derive(PartialEq, Eq)] +/// A buffer of memory that can be used as a buffer for a [Context] or [GraphAllocator]. +#[derive(PartialEq, Eq, Debug)] pub struct Buffer { data: *mut c_void, layout: Layout, @@ -375,6 +375,68 @@ impl GraphExecutionPlan { } } +#[derive(PartialEq, Eq, Debug)] +/// Acts as a RAII-guard over a `sys::ggml_allocr`, allocating via +/// `ggml_allocr_new` and dropping via `ggml_allocr_free`. +/// Used to allocate the memory used by a computational graph. +pub struct GraphAllocator { + /// The underlying `sys::ggml_allocr` pointer. + pub ptr: *mut sys::ggml_allocr, + /// The buffer used by this allocator. + pub buffer: Buffer, +} + +impl GraphAllocator { + /// Create a new allocator with the specified buffer. + pub fn new(buffer: Buffer, tensor_alignment: usize) -> Self { + let ptr = unsafe { sys::ggml_allocr_new(buffer.data, buffer.size(), tensor_alignment) }; + Self { ptr, buffer } + } + + /// Create a new allocator to measure a computational graph. + pub fn new_measurement(tensor_alignment: usize) -> Self { + let ptr = unsafe { sys::ggml_allocr_new_measure(tensor_alignment) }; + let buffer = Buffer::new(tensor_alignment); + Self { ptr, buffer } + } + + /// Allocates a computational graph in the allocator and returns the size in bytes. + pub fn allocate_graph(&self, graph: &ComputationGraph) -> usize { + unsafe { sys::ggml_allocr_alloc_graph(self.ptr, graph.inner) } + } + + /// Resets the allocator for a new forward pass. + pub fn reset(&self) { + unsafe { sys::ggml_allocr_reset(self.ptr) } + } + + /// Returns true if the allocator is in measuring mode. + pub fn in_measuring_mode(&self) -> bool { + unsafe { sys::ggml_allocr_is_measure(self.ptr) } + } + + /// Allocates memory for a given tensor in the allocator. + pub fn allocate(&self, tensor: &Tensor) { + unsafe { sys::ggml_allocr_alloc(self.ptr, tensor.ptr.as_ptr()) } + } + + /// Switches the buffer used by the allocator. + pub fn switch_buffer(&mut self, buffer: Buffer, tensor_alignment: usize) { + // Free the old allocator + unsafe { sys::ggml_allocr_free(self.ptr) } + // Create a new allocator with the new buffer + let ptr = unsafe { sys::ggml_allocr_new(buffer.data, buffer.size(), tensor_alignment) }; + self.ptr = ptr; + self.buffer = buffer; + } +} + +impl Drop for GraphAllocator { + fn drop(&mut self) { + unsafe { sys::ggml_allocr_free(self.ptr) } + } +} + /// The size of `t` as bytes. pub fn type_size(t: Type) -> usize { unsafe { sys::ggml_type_size(t.into()) } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 799d818f..287fe51a 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -1,4 +1,4 @@ -use ggml::{Buffer, ComputationGraph, Context, GraphExecutionPlan, Tensor}; +use ggml::{Buffer, ComputationGraph, Context, GraphAllocator, GraphExecutionPlan, Tensor}; use serde::Serialize; use std::{cell::RefCell, fmt::Display, sync::Arc}; use thiserror::Error; @@ -19,6 +19,9 @@ pub struct GraphOutputs { /// The output containing embeddings pub embedding_result: Tensor, + + /// The length of the output + pub output_length: usize, } /// An inference session represents the state of the text generation. 
This holds @@ -74,14 +77,22 @@ pub struct InferenceSession { ctx0: Context, n_embd: usize, + + /// Allocator used by this session + allocator: GraphAllocator, + + /// Context size of this session + context_size: usize, } pub struct BuildContext<'session> { //FIXME: Borrowing issue, don't know how to fix it pub ctx0: RefCell<&'session mut Context>, + pub allocator: RefCell<&'session GraphAllocator>, pub embd: &'session Tensor, pub memory_k: &'session Tensor, pub memory_v: &'session Tensor, + pub n_past: usize, } unsafe impl Send for InferenceSession {} impl InferenceSession { /// Create a new InferenceSession let eval = Buffer::new(buf_size); let ctx0 = ggml::Context::new_with_buffer(eval); + let allocator = GraphAllocator::new_measurement(32); // Set up Metal support #[cfg(feature = "metal")] let metal_context = { @@ -186,6 +198,8 @@ impl InferenceSession { metal_context, ctx0, n_embd, + allocator, + context_size, } } @@ -197,23 +211,64 @@ impl InferenceSession { builder: F, ) -> GraphOutputs where - F: FnOnce(BuildContext) -> (ComputationGraph, GraphOutputs), + F: Fn(BuildContext) -> (ComputationGraph, GraphOutputs), { // Build a graph self.ctx0.recreate(); let ctx0 = &mut self.ctx0; + + // Check if we need to allocate the graph + if self.allocator.in_measuring_mode() { + // If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens. + let tensor_alignment = 32; + + let max_n_tokens = self.config.n_batch.min(self.context_size); + // We assume the history is full + let max_n_past = self.context_size - max_n_tokens; + let embd = ctx0 + .new_tensor_1d(ggml::Type::I32, max_n_tokens) + .set_name("embd"); + + self.allocator.allocate(&embd); + + let bc = BuildContext { + ctx0: RefCell::new(ctx0), + allocator: RefCell::new(&self.allocator), + embd: &embd, + memory_k: &self.memory_k, + memory_v: &self.memory_v, + n_past: max_n_past, + }; + + let (worst_case_graph, _) = builder(bc); + let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment; + let buffer = Buffer::new(graph_size); + + self.allocator.switch_buffer(buffer, tensor_alignment); + } + let mut embd = ctx0 .new_tensor_1d(ggml::Type::I32, input_tokens.len()) .set_name("embd"); let bc = BuildContext { ctx0: RefCell::new(ctx0), + allocator: RefCell::new(&self.allocator), embd: &embd, memory_k: &self.memory_k, memory_v: &self.memory_v, + n_past: self.n_past, }; + + // Reset the allocator + self.allocator.reset(); + self.allocator.allocate(&embd); + let (mut built_gf, built_result) = builder(bc); + // Allocate the graph + self.allocator.allocate_graph(&built_gf); + // Do Metal'y stuff #[cfg(feature = "metal")] { @@ -263,6 +318,7 @@ impl InferenceSession { GraphOutputs { result: built_result.result.share(), embedding_result: built_result.embedding_result.share(), + output_length: input_tokens.len(), } } diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index efa1f338..87f454fe 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -331,6 +331,7 @@ impl KnownModel for Bloom { GraphOutputs { result: input_layer, embedding_result: embeddings_tensor, + output_length: input_len, }, ) }); diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 48edc107..a53e1342 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -352,6 +352,7 @@ impl KnownModel for Falcon { GraphOutputs { result: input_layer, embedding_result: embeddings_tensor, + output_length:
n, }, ) }); diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index 5a6df70c..2d9bc2bd 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -319,6 +319,7 @@ impl KnownModel for Gpt2 { GraphOutputs { result: input_layer, embedding_result: embeddings_tensor, + output_length: input_len, }, ) }); diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index c013625a..cc5dd9b0 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -300,6 +300,7 @@ impl KnownModel for GptJ { GraphOutputs { result: input_layer, embedding_result: embeddings_tensor, + output_length: input_len, }, ) }); diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 70b68170..b85a43e5 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -337,6 +337,7 @@ impl KnownModel for GptNeoX { GraphOutputs { result: input_layer, embedding_result: embeddings_tensor, + output_length: n, }, ) }); diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 0adf1d57..ea2530ec 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -147,8 +147,6 @@ impl KnownModel for Llama { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let input_len = input_tokens.len(); - let session_len = session.n_past; let ctx_size = self.params.context_size; let Hyperparameters { @@ -164,10 +162,16 @@ impl KnownModel for Llama { let n_embd_gqa = n_embd / (n_head / n_head_kv); let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let session_len = builder.n_past; + let input_len = builder.embd.nelements(); + let mut ctx0 = builder.ctx0.borrow_mut(); + let allocator = builder.allocator.borrow(); + let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); + allocator.allocate(&input_layer); let mut gf = ctx0.create_compute_graph(); @@ -350,14 +354,25 @@ impl KnownModel for Llama { GraphOutputs { result: input_layer, embedding_result, + output_length: input_len, }, ) }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, input_len); - common::extract_logits(output_request, &outputs.result, n_vocab, input_len); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); + common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::extract_logits( + output_request, + &outputs.result, + n_vocab, + outputs.output_length, + ); + common::extract_embeddings( + output_request, + &outputs.embedding_result, + n_embd, + outputs.output_length, + ); } fn hyperparameters(&self) -> &Self::Hyperparameters { diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 5d36d310..ba894f97 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -237,6 +237,7 @@ impl KnownModel for Mpt { GraphOutputs { result: input_layer, embedding_result: embeddings_tensor, + output_length: n, }, ) }); From 6ba5126bdca6572d8ce38bbf466e0f984f1a6785 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 24 Sep 2023 21:39:30 +0200 Subject: [PATCH 11/22] Working graph allocator for `llama` --- crates/ggml/src/context.rs | 31 +++++++++++---- crates/ggml/src/lib.rs | 32 +++++++--------- crates/ggml/sys/llama-cpp | 2 +- crates/llm-base/src/inference_session.rs | 48 +++++++++++------------- crates/llm-base/src/lora.rs | 3 +- 
crates/models/llama/src/lib.rs | 2 - 6 files changed, 62 insertions(+), 56 deletions(-) diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs index 6f7a593f..2439e2a7 100644 --- a/crates/ggml/src/context.rs +++ b/crates/ggml/src/context.rs @@ -73,7 +73,12 @@ impl ContextInner { /// Controls how the context uses memory. pub enum ContextStorage { /// Use the provided buffer as memory. - Buffer(Buffer), + Buffer { + /// The buffer to use as memory. + buffer: Buffer, + /// Whether to allocate tensors into this buffer. + allocate: bool, + }, /// Use the provided memory mapped file as memory. Mmap(Mmap), /// Allocate `mem_size` bytes of memory. @@ -94,7 +99,10 @@ impl ContextStorage { /// Returns the `Buffer` if this is a `Buffer` variant. pub fn as_buffer(&self) -> Option<&Buffer> { match self { - Self::Buffer(v) => Some(v), + Self::Buffer { + buffer: v, + allocate: _, + } => Some(v), _ => None, } } @@ -115,7 +123,16 @@ impl PartialEq for ContextStorage { fn eq(&self, other: &Self) -> bool { use ContextStorage::*; match (self, other) { - (Buffer(l0), Buffer(r0)) => l0 == r0, + ( + Buffer { + buffer: l0, + allocate: l1, + }, + Buffer { + buffer: r0, + allocate: r1, + }, + ) => l0 == r0 && l1 == r1, (Mmap(l0), Mmap(r0)) => l0.as_ptr() == r0.as_ptr(), (Allocate { mem_size: l }, Allocate { mem_size: r }) => l == r, _ => false, @@ -130,10 +147,10 @@ impl Context { /// Creates a new [Context] with the given storage. pub fn new(storage: ContextStorage) -> Self { let init_params = match &storage { - ContextStorage::Buffer(buffer) => sys::ggml_init_params { + ContextStorage::Buffer { buffer, allocate } => sys::ggml_init_params { mem_size: buffer.size(), mem_buffer: buffer.data, - no_alloc: false, + no_alloc: !allocate, }, ContextStorage::Mmap(mmap) => sys::ggml_init_params { mem_size: mmap.len(), @@ -160,8 +177,8 @@ impl Context { /// Creates a new [Context] with the specified buffer. /// The buffer will be used by GGML. - pub fn new_with_buffer(buffer: Buffer) -> Self { - Self::new(ContextStorage::Buffer(buffer)) + pub fn new_with_buffer(buffer: Buffer, allocate: bool) -> Self { + Self::new(ContextStorage::Buffer { buffer, allocate }) } /// Creates a new [Context] with the specified memory mapped file. diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 507b1f60..66ed47f9 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -131,6 +131,9 @@ pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize; /// Default epsilon to use for RMS computation. pub const DEFAULT_EPS: f32 = 0.000005; +/// Maximum number of nodes in a `ggml` graph. +pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize; + /// Value overrides to use for RoPE. /// /// Formula: `theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]` @@ -348,26 +351,12 @@ impl GraphExecutionPlan { } } - /// Creates a [Type::I8] work buffer with size `plan.work_size` for this [GraphExecutionPlan] in the given [Context]. - fn create_work_buffer(&mut self, context: &Context) -> Tensor { - context.new_tensor_1d(Type::I8, self.inner.work_size) - } - - /// Assign a work buffer to this [GraphExecutionPlan]. - fn assign_work_buffer(&mut self, buffer: &mut Tensor) { - assert!( - buffer.get_type() == Type::I8, - "Work buffer must be of type i8" - ); - unsafe { - self.inner.work_data = buffer.data().cast(); - } - } - /// Execute this [GraphExecutionPlan] in the given [Context]. 
- pub fn execute(&mut self, context: &Context) { - let mut work_buffer = self.create_work_buffer(context); - self.assign_work_buffer(&mut work_buffer); + pub fn execute(&mut self, buffer: &mut Vec<u8>) { + if self.inner.work_size > 0 { + buffer.resize(self.inner.work_size, 0); + self.inner.work_data = buffer.as_mut_ptr().cast(); + } unsafe { sys::ggml_graph_compute(self.inner_graph, &mut self.inner); @@ -556,3 +545,8 @@ pub fn cpu_has_gpublas() -> bool { pub fn graph_overhead() -> usize { unsafe { sys::ggml_graph_overhead() } } + +/// Returns the tensor overhead in bytes. +pub fn tensor_overhead() -> usize { + unsafe { sys::ggml_tensor_overhead() } +} diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp index b08e75ba..c091cdfb 160000 --- a/crates/ggml/sys/llama-cpp +++ b/crates/ggml/sys/llama-cpp @@ -1 +1 @@ -Subproject commit b08e75baea294e366628b898e85c0bd359b58115 +Subproject commit c091cdfb24621710c617ea85c92fcd347d0bf340 diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 287fe51a..b7495205 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -83,6 +83,9 @@ pub struct InferenceSession { /// Context size of this session context_size: usize, + + /// Work buffer for graph planning + work_buffer: Vec<u8>, } pub struct BuildContext<'session> { @@ -146,24 +149,11 @@ impl InferenceSession { let n_elements = n_embd * n_mem; let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements); - // Allocate buffer for storing intermediate values during evaluation (ctx0 backing) - // For the first run, we need to guess a maximum buffer size so we can measure - // the actual memory consumption of the temporary ggml context. - // - // These numbers are from `llama.cpp`, and could potentially be more efficient. - let buf_size = { - let buf_size_mb = if n_layer >= 80 { - 1536 - } else if n_layer >= 60 { - 1280 - } else { - 1024 - }; - buf_size_mb * 1024 * 1024 + ggml::graph_overhead() - }; - + // Allocate buffer for storing tensor and graph structs + // Should be 1540816 + let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES); let eval = Buffer::new(buf_size); - let ctx0 = ggml::Context::new_with_buffer(eval); + let ctx0 = ggml::Context::new_with_buffer(eval, false); let allocator = GraphAllocator::new_measurement(32); // Set up Metal support #[cfg(feature = "metal")] let metal_context = { @@ -200,6 +190,7 @@ impl InferenceSession { n_embd, allocator, context_size, + work_buffer: vec![0], } } @@ -213,12 +204,12 @@ impl InferenceSession { where F: Fn(BuildContext) -> (ComputationGraph, GraphOutputs), { - // Build a graph - self.ctx0.recreate(); - let ctx0 = &mut self.ctx0; - // Check if we need to allocate the graph if self.allocator.in_measuring_mode() { + // Build a graph + self.ctx0.recreate(); + let ctx0 = &mut self.ctx0; + // If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens.
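// Measuring mode is backed by `ggml_allocr_new_measure`: no real memory is
// handed out, the allocator only records the peak usage the graph would need.
// Building the graph once with the largest admissible input (a full batch
// against a full history) therefore yields an upper bound, so the buffer
// allocated from that measurement fits every subsequent forward pass.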
let tensor_alignment = 32; @@ -240,13 +231,18 @@ impl InferenceSession { n_past: max_n_past, }; - let (worst_case_graph, _) = builder(bc); + let (mut worst_case_graph, built_result) = builder(bc); + worst_case_graph.build_forward_expand(&built_result.result); + // Should be 73924640 let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment; let buffer = Buffer::new(graph_size); self.allocator.switch_buffer(buffer, tensor_alignment); } + self.ctx0.recreate(); + let ctx0 = &mut self.ctx0; + let mut embd = ctx0 .new_tensor_1d(ggml::Type::I32, input_tokens.len()) .set_name("embd"); @@ -266,6 +262,9 @@ impl InferenceSession { let (mut built_gf, built_result) = builder(bc); + // Build the graph + built_gf.build_forward_expand(&built_result.result); + // Allocate the graph self.allocator.allocate_graph(&built_gf); @@ -280,9 +279,6 @@ impl InferenceSession { // Write input tokens unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; - // Compute the graph - built_gf.build_forward_expand(&built_result.result); - #[cfg(feature = "metal")] { // FIXME can only process one token at a time currently @@ -303,7 +299,7 @@ impl InferenceSession { #[cfg(not(feature = "metal"))] { let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads); - plan.execute(ctx0); + plan.execute(&mut self.work_buffer); } // Adjust the required memory per token if we didn't know that already diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs index c6d1d8a2..f433931e 100644 --- a/crates/llm-base/src/lora.rs +++ b/crates/llm-base/src/lora.rs @@ -128,8 +128,9 @@ impl LoraAdapter { gf.build_forward_expand(&output); //TODO: maybe pass the model's thread count to this context + let mut work_buffer = vec![0u8]; let mut plan = GraphExecutionPlan::new(&mut gf, 8); - plan.execute(&patch_context); + plan.execute(&mut work_buffer); // Overwrite the original tensor. 
// The `output` and the `target_tensor` are not from the same context, diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index ea2530ec..61c1d196 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -166,12 +166,10 @@ impl KnownModel for Llama { let input_len = builder.embd.nelements(); let mut ctx0 = builder.ctx0.borrow_mut(); - let allocator = builder.allocator.borrow(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); - allocator.allocate(&input_layer); let mut gf = ctx0.create_compute_graph(); From 78b0e25c7164cfa9e56cf6ac648e803432d5a0aa Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Tue, 26 Sep 2023 18:21:27 +0200 Subject: [PATCH 12/22] Scope `input_length` and `session_len` to `BuildContext` --- crates/llm-base/src/inference_session.rs | 8 +++++++- crates/models/bloom/src/lib.rs | 20 +++++++++++++++----- crates/models/falcon/src/lib.rs | 23 +++++++++++++++++------ crates/models/gpt2/src/lib.rs | 20 +++++++++++++++----- crates/models/gptj/src/lib.rs | 21 ++++++++++++++++----- crates/models/gptneox/src/lib.rs | 21 ++++++++++++++++----- crates/models/llama/src/lib.rs | 2 +- crates/models/mpt/src/lib.rs | 20 +++++++++++++++----- 8 files changed, 102 insertions(+), 33 deletions(-) diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index b7495205..3f6dedea 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -54,7 +54,7 @@ pub struct InferenceSession { /// How many tokens have been fed into the model's working memory so far. #[doc(hidden)] - pub n_past: usize, + n_past: usize, /// How much memory is required per token for the temporary context used /// during inference. 
@@ -98,6 +98,12 @@ pub struct BuildContext<'session> { pub n_past: usize, } +impl<'session> BuildContext<'session> { + pub fn input_length(&self) -> usize { + self.embd.nelements() + } +} + unsafe impl Send for InferenceSession {} impl InferenceSession { /// Create a new InferenceSession diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index 87f454fe..fb26ff3d 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -119,8 +119,6 @@ impl KnownModel for Bloom { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let input_len = input_tokens.len(); - let session_len = session.n_past; let ctx_size = self.params.context_size; let Hyperparameters { @@ -133,6 +131,8 @@ impl KnownModel for Bloom { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let session_len = builder.n_past; + let input_len = builder.input_length(); let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), @@ -337,9 +337,19 @@ impl KnownModel for Bloom { }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, input_len); - common::extract_logits(output_request, &outputs.result, n_vocab, input_len); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); + common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::extract_logits( + output_request, + &outputs.result, + n_vocab, + outputs.output_length, + ); + common::extract_embeddings( + output_request, + &outputs.embedding_result, + n_embd, + outputs.output_length, + ); } fn hyperparameters(&self) -> &Self::Hyperparameters { diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index a53e1342..f9f6c5d7 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -156,8 +156,6 @@ impl KnownModel for Falcon { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let input_len = input_tokens.len(); - let session_len = session.n_past; let ctx_size = self.params.context_size; let Hyperparameters { @@ -170,9 +168,12 @@ impl KnownModel for Falcon { } = self.hyperparameters; let head_dim = n_embd / n_head; - let n = input_len; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let input_len = builder.input_length(); + let n = input_len; + let session_len = builder.n_past; + let mut ctx0 = builder.ctx0.borrow_mut(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd); @@ -358,9 +359,19 @@ impl KnownModel for Falcon { }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, input_len); - common::extract_logits(output_request, &outputs.result, n_vocab, input_len); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); + common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::extract_logits( + output_request, + &outputs.result, + n_vocab, + outputs.output_length, + ); + common::extract_embeddings( + output_request, + &outputs.embedding_result, + n_embd, + outputs.output_length, + ); } fn hyperparameters(&self) -> &Self::Hyperparameters { diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index 2d9bc2bd..d06eb1ec 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -141,8 +141,6 @@ impl KnownModel for Gpt2 { input_tokens: &[TokenId], 
output_request: &mut OutputRequest, ) { - let input_len = input_tokens.len(); - let session_len = session.n_past; let ctx_size = self.params.context_size; let Hyperparameters { @@ -154,6 +152,8 @@ impl KnownModel for Gpt2 { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let input_len = builder.input_length(); + let session_len = builder.n_past; let mut ctx0 = builder.ctx0.borrow_mut(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), @@ -325,9 +325,19 @@ impl KnownModel for Gpt2 { }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, input_len); - common::extract_logits(output_request, &outputs.result, n_vocab, input_len); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); + common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::extract_logits( + output_request, + &outputs.result, + n_vocab, + outputs.output_length, + ); + common::extract_embeddings( + output_request, + &outputs.embedding_result, + n_embd, + outputs.output_length, + ); } fn hyperparameters(&self) -> &Self::Hyperparameters { diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index cc5dd9b0..b4ee3d82 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -137,8 +137,6 @@ impl KnownModel for GptJ { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let input_len = input_tokens.len(); - let session_len = session.n_past; let ctx_size = self.params.context_size; let Hyperparameters { @@ -151,6 +149,9 @@ impl KnownModel for GptJ { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let input_len = builder.input_length(); + let session_len = builder.n_past; + let mut ctx0 = builder.ctx0.borrow_mut(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), @@ -306,9 +307,19 @@ impl KnownModel for GptJ { }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, input_len); - common::extract_logits(output_request, &outputs.result, n_vocab, input_len); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); + common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::extract_logits( + output_request, + &outputs.result, + n_vocab, + outputs.output_length, + ); + common::extract_embeddings( + output_request, + &outputs.embedding_result, + n_embd, + outputs.output_length, + ); } fn hyperparameters(&self) -> &Self::Hyperparameters { diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index b85a43e5..e355fe22 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -159,8 +159,6 @@ impl KnownModel for GptNeoX { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let n = input_tokens.len(); - let n_past = session.n_past; let n_ctx = self.params.context_size; let Hyperparameters { @@ -174,6 +172,9 @@ impl KnownModel for GptNeoX { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let n = builder.input_length(); + let n_past = builder.n_past; + let mut ctx0 = builder.ctx0.borrow_mut(); let embd = builder.embd; let mut input_layer = ctx0.op_get_rows(&self.wte, embd); @@ -343,9 +344,19 @@ impl KnownModel for GptNeoX { }); // finish evaluation - common::read_last_token(session, &outputs.result, 
n_vocab, n); - common::extract_logits(output_request, &outputs.result, n_vocab, n); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); + common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::extract_logits( + output_request, + &outputs.result, + n_vocab, + outputs.output_length, + ); + common::extract_embeddings( + output_request, + &outputs.embedding_result, + n_embd, + outputs.output_length, + ); } fn hyperparameters(&self) -> &Self::Hyperparameters { diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 61c1d196..69ab5aa8 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -163,7 +163,7 @@ impl KnownModel for Llama { let outputs = session.compute(self.context.clone(), input_tokens, |builder| { let session_len = builder.n_past; - let input_len = builder.embd.nelements(); + let input_len = builder.input_length(); let mut ctx0 = builder.ctx0.borrow_mut(); diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index ba894f97..1e52d2d0 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -96,8 +96,6 @@ impl KnownModel for Mpt { input_tokens: &[TokenId], output_request: &mut OutputRequest, ) { - let n = input_tokens.len(); - let session_len = session.n_past; let ctx_size = self.params.context_size; let Hyperparameters { @@ -110,6 +108,8 @@ impl KnownModel for Mpt { } = self.hyperparameters; let outputs = session.compute(self.context.clone(), input_tokens, |builder| { + let n = builder.input_length(); + let session_len = builder.n_past; let ctx0 = builder.ctx0.borrow(); let (memory_k_size, memory_v_size) = ( builder.memory_k.element_size(), @@ -243,9 +243,19 @@ impl KnownModel for Mpt { }); // finish evaluation - common::read_last_token(session, &outputs.result, n_vocab, n); - common::extract_logits(output_request, &outputs.result, n_vocab, n); - common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); + common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length); + common::extract_logits( + output_request, + &outputs.result, + n_vocab, + outputs.output_length, + ); + common::extract_embeddings( + output_request, + &outputs.embedding_result, + n_embd, + outputs.output_length, + ); } fn hyperparameters(&self) -> &Self::Hyperparameters { From 8ad589b9fb1e024ed58346578b57adbd66e60bb6 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 30 Sep 2023 11:12:16 +0200 Subject: [PATCH 13/22] Logging + `mpt` tests --- binaries/llm-test/configs/mpt.json | 2 +- crates/ggml/src/lib.rs | 3 +++ crates/ggml/src/tensor.rs | 2 +- crates/llm-base/src/inference_session.rs | 29 ++++++++++++++++-------- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/binaries/llm-test/configs/mpt.json b/binaries/llm-test/configs/mpt.json index 57a8bc89..c5d9d8d0 100644 --- a/binaries/llm-test/configs/mpt.json +++ b/binaries/llm-test/configs/mpt.json @@ -6,7 +6,7 @@ { "Inference": { "input": "When a llama rides a crab, ", - "output": "When a llama rides a crab,  the llama is called the \"crab rider\".\nThe crabs are very popular in South America, especially Brazil. They have been used as transportation for many years and they can carry up to five people at once!", + "output": "When a llama rides a crab,  the llama is called the \"crab rider\"\nThe Llamas are an animal that can be found in The Maze. 
They have no special abilities, but they do drop Llamaskin and occasionally some other items when killed by players or monsters alike (see below). It's unknown if there was ever any sort of breeding system for these animals as it seems to only exist on this one world so far; however their existence has been confirmed through player reports from multiple worlds where people claim having seen them before being able see anything else about what happened after seeing just 1-2 at most per game session which makes me believe", "maximum_token_count": 128 } }, diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 66ed47f9..c2daccf7 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -134,6 +134,9 @@ pub const DEFAULT_EPS: f32 = 0.000005; /// Maximum number of nodes in a `ggml` graph. pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize; +/// Alignment used for the Tensors in a `ggml` graph. +pub const TENSOR_ALIGNMENT: usize = 32; + /// Value overrides to use for RoPE. /// /// Formula: `theta_i = scale * base^(−2(i−1)/d), for i in [1, 2, ..., d/2]` diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 33d7114c..df0fe7d5 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -88,7 +88,7 @@ impl Tensor { self.with_alive_ctx(|| { #[cfg(feature = "cublas")] unsafe { - sys::cuda::ggml_cuda_assign_buffers(self.ptr.as_ptr()); + sys::cuda::ggml_cuda_assign_buffers_no_alloc(self.ptr.as_ptr()); } }) } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 3f6dedea..dacc67e4 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -139,9 +139,14 @@ impl InferenceSession { size }; + log::info!( + "Allocating {:.2} MB for KV-memory", + context_byte_size / (1024 * 1024) + ); + if use_gpu { ggml::accelerator::initialize(0); - ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024); + ggml::accelerator::set_scratch_size(0); } // TODO: revisit this with `Rc`, maybe? We should be able to prove that the session @@ -156,12 +161,16 @@ impl InferenceSession { let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements); // Allocate buffer for storing tensor and graph structs - // Should be 1540816 let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES); let eval = Buffer::new(buf_size); + log::info!( + "Allocating {:.2} MB for eval-context", + buf_size / (1024 * 1024) + ); + let ctx0 = ggml::Context::new_with_buffer(eval, false); - let allocator = GraphAllocator::new_measurement(32); + let allocator = GraphAllocator::new_measurement(ggml::TENSOR_ALIGNMENT); // Set up Metal support #[cfg(feature = "metal")] let metal_context = { @@ -217,8 +226,6 @@ impl InferenceSession { let ctx0 = &mut self.ctx0; // If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens. 
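// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch series.]
// The "Allocating {} MB for KV-memory" log line introduced above sums one K
// tensor and one V tensor, each holding context_size * n_layer * n_embd
// elements. A standalone back-of-the-envelope version of that formula (the
// concrete shapes in main() are hypothetical, not taken from the patch):
fn kv_cache_bytes(context_size: usize, n_layer: usize, n_embd: usize, bytes_per_element: usize) -> usize {
    // Two tensors (K and V), each of context_size * n_layer * n_embd elements.
    2 * context_size * n_layer * n_embd * bytes_per_element
}

fn main() {
    // A hypothetical 2048-token context, 32 layers, 4096-wide embeddings, f16 cache:
    let bytes = kv_cache_bytes(2048, 32, 4096, 2);
    println!("{} MB", bytes / (1024 * 1024)); // prints "1024 MB"
}
// ---------------------------------------------------------------------------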
-        let tensor_alignment = 32;
-
         let max_n_tokens = self.config.n_batch.min(self.context_size);
         // We assume the history is full
         let max_n_past = self.context_size - max_n_tokens;
@@ -238,12 +245,16 @@
         };

         let (mut worst_case_graph, built_result) = builder(bc);
+        // Expand the graph
         worst_case_graph.build_forward_expand(&built_result.result);
-        // Should be 73924640
-        let graph_size = self.allocator.allocate_graph(&worst_case_graph) + tensor_alignment;
-        let buffer = Buffer::new(graph_size);
-        self.allocator.switch_buffer(buffer, tensor_alignment);
+        // Allocate the graph
+        let graph_size =
+            self.allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
+        log::info!("Allocating {:.2} MB for graph", graph_size / (1024 * 1024));
+        // Pre-allocate the buffer for future use
+        let buffer = Buffer::new(graph_size);
+        self.allocator.switch_buffer(buffer, ggml::TENSOR_ALIGNMENT);
     }

From e506b0b0c19f241399fe0c27aac83221eb4a4059 Mon Sep 17 00:00:00 2001
From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com>
Date: Sat, 30 Sep 2023 15:14:57 +0200
Subject: [PATCH 14/22] Try to set the cuda scratch offset

---
 crates/ggml/src/accelerator/mod.rs       |  1 +
 crates/ggml/src/context.rs               |  4 +-
 crates/ggml/src/lib.rs                   | 59 ++++++++++++++++++++++--
 crates/ggml/src/tensor.rs                | 17 +++++++
 crates/ggml/sys/llama-cpp                |  2 +-
 crates/ggml/sys/src/lib.rs               | 11 +++++
 crates/ggml/sys/src/metal.rs             | 14 ++----
 crates/llm-base/src/inference_session.rs | 52 ++++++++++++++++-----
 8 files changed, 132 insertions(+), 28 deletions(-)

diff --git a/crates/ggml/src/accelerator/mod.rs b/crates/ggml/src/accelerator/mod.rs
index 2e1cef17..731de9bc 100644
--- a/crates/ggml/src/accelerator/mod.rs
+++ b/crates/ggml/src/accelerator/mod.rs
@@ -71,6 +71,7 @@ pub fn initialize(device: i32) {
         //TODO: Make this configurable
         sys::cuda::ggml_init_cublas();
         sys::cuda::ggml_cuda_set_main_device(device);
+        sys::cuda::ggml_cuda_set_mul_mat_q(true);
         let split = 1.0f32;
         sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
     }

diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 2439e2a7..11c35682 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -21,7 +21,7 @@ pub struct Context {
     /// allocated tensors. Tensors are owned by the object, so a [`Tensor`]
     /// contains a `Weak` reference underneath and doesn't let you do anything
     /// with it if the underlying context has been deallocated.
-    inner: Arc<ContextInner>,
+    pub inner: Arc<ContextInner>,

     /// The storage for this context. This is stored so that the buffer can be dropped when the context is dropped.
     storage: Option<ContextStorage>,
@@ -31,7 +31,7 @@
 }

 /// Contains state shared between a context and its tensors
-pub(crate) struct ContextInner {
+pub struct ContextInner {
     pub ptr: NonNull<sys::ggml_context>,

     /// Offloaded tensors. Used to free them when the context is dropped.

diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs
index c2daccf7..26bcc548 100644
--- a/crates/ggml/src/lib.rs
+++ b/crates/ggml/src/lib.rs
@@ -10,6 +10,8 @@
 use std::{
     alloc::Layout,
     os::raw::{c_int, c_void},
+    ptr::NonNull,
+    sync::Arc,
 };

 mod context;
@@ -308,10 +310,27 @@ impl Buffer {
         }
     }

+    /// Creates a new buffer of the specified size, without aligning it.
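+    ///
+    /// [Editor's note: the following rationale is an assumption drawn from the
+    /// surrounding diff, not a statement from the patch author.] This appears
+    /// to exist for the graph allocator's backing buffer, where `ggml_allocr_new`
+    /// receives an explicit `tensor_alignment` and handles alignment itself, so
+    /// the host allocation does not need an aligned `Layout`.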
+    pub fn new_unaligned(size: usize) -> Self {
+        let layout = Layout::from_size_align(size, 1).unwrap();
+
+        unsafe {
+            Buffer {
+                data: std::alloc::alloc(layout).cast(),
+                layout,
+            }
+        }
+    }
+
     /// Returns the size of the buffer in bytes
     pub fn size(&self) -> usize {
         self.layout.size()
     }
+
+    /// Returns a pointer to the data in this buffer.
+    pub fn data(&mut self) -> *mut c_void {
+        self.data
+    }
 }

 impl Drop for Buffer {
@@ -337,6 +356,37 @@ impl ComputationGraph {
     pub fn build_forward_expand(&mut self, tensor: &Tensor) {
         unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
     }
+
+    /// Returns the leafs in this graph.
+    pub fn leafs(&self, context: &Context) -> Vec<Tensor> {
+        let mut wrapped_leafs: Vec<Tensor> = vec![];
+        unsafe {
+            for leaf in self.inner.as_ref().unwrap().leafs {
+                if !leaf.is_null() {
+                    wrapped_leafs.push(Tensor {
+                        ptr: NonNull::new(leaf).expect("Should not be null"),
+                        inner: Arc::downgrade(&context.inner),
+                    })
+                }
+            }
+            wrapped_leafs
+        }
+    }
+    /// Returns the nodes in this graph.
+    pub fn nodes(&self, context: &Context) -> Vec<Tensor> {
+        let mut wrapped_nodes: Vec<Tensor> = vec![];
+        unsafe {
+            for node in self.inner.as_ref().unwrap().nodes {
+                if !node.is_null() {
+                    wrapped_nodes.push(Tensor {
+                        ptr: NonNull::new(node).expect("Should not be null"),
+                        inner: Arc::downgrade(&context.inner),
+                    })
+                }
+            }
+            wrapped_nodes
+        }
+    }
 }

 /// A `ggml` execution plan. Contains the information needed to execute a computation graph.
@@ -413,13 +463,14 @@ impl GraphAllocator {
     }

     /// Switches the buffer used by the allocator.
-    pub fn switch_buffer(&mut self, buffer: Buffer, tensor_alignment: usize) {
+    pub fn resize_buffer(&mut self, graph_size: usize, tensor_alignment: usize) {
         // Free the old allocator
         unsafe { sys::ggml_allocr_free(self.ptr) }
+        // Resize the buffer
+        self.buffer = Buffer::new_unaligned(graph_size);
         // Create a new allocator with the new buffer
-        let ptr = unsafe { sys::ggml_allocr_new(buffer.data, buffer.size(), tensor_alignment) };
-        self.ptr = ptr;
-        self.buffer = buffer;
+        self.ptr =
+            unsafe { sys::ggml_allocr_new(self.buffer.data, self.buffer.size(), tensor_alignment) };
     }
 }

diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs
index df0fe7d5..ee5354c2 100644
--- a/crates/ggml/src/tensor.rs
+++ b/crates/ggml/src/tensor.rs
@@ -52,6 +52,11 @@ impl Tensor {
         })
     }

+    /// Returns true if the 'extra' field of this tensor is set, e.g. by ggml-cuda.
+    pub fn has_extras(&self) -> bool {
+        self.with_alive_ctx(|| unsafe { !self.ptr.as_ref().extra.is_null() })
+    }
+
     /// Sets the tensor's acceleration backend and moves the tensor's data to the new backend.
     pub fn transfer_to(mut self, backend: Backend) -> Tensor {
         self.with_alive_ctx_mut(|t| {
@@ -111,6 +116,18 @@ impl Tensor {
         })
     }

+    /// If ggml-sys is compiled with CUDA support, this function will set the tensor's scratch offset.
+    /// If not, this is a no-op.
+    #[allow(unused_variables)]
+    pub fn assign_scratch_offset(&self, offset: usize) {
+        self.with_alive_ctx(|| {
+            #[cfg(feature = "cublas")]
+            unsafe {
+                sys::cuda::ggml_cuda_assign_scratch_offset(self.ptr.as_ptr(), offset);
+            }
+        })
+    }
+
     /// Creates a shared copy of this tensor pointer.
     pub fn share(&self) -> Self {
         Tensor {

diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index c091cdfb..da040034 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit c091cdfb24621710c617ea85c92fcd347d0bf340
+Subproject commit da0400344be12074e67dcabc565140289cf7efaa

diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs
index 884ef75b..46a2b194 100644
--- a/crates/ggml/sys/src/lib.rs
+++ b/crates/ggml/sys/src/lib.rs
@@ -176,6 +176,10 @@
 pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
 pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
 pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
 pub type ggml_object_type = ::std::os::raw::c_int;
+pub const ggml_log_level_GGML_LOG_LEVEL_ERROR: ggml_log_level = 2;
+pub const ggml_log_level_GGML_LOG_LEVEL_WARN: ggml_log_level = 3;
+pub const ggml_log_level_GGML_LOG_LEVEL_INFO: ggml_log_level = 4;
+pub type ggml_log_level = ::std::os::raw::c_int;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_object {
@@ -2187,6 +2191,13 @@ pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result =
 pub type ggml_opt_result = ::std::os::raw::c_int;
 pub type ggml_opt_callback = ::std::option::Option;
+pub type ggml_log_callback = ::std::option::Option<
+    unsafe extern "C" fn(
+        level: ggml_log_level,
+        text: *const ::std::os::raw::c_char,
+        user_data: *mut ::std::os::raw::c_void,
+    ),
+>;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_opt_params {

diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs
index 95a8f506..cd8040ec 100644
--- a/crates/ggml/sys/src/metal.rs
+++ b/crates/ggml/sys/src/metal.rs
@@ -2,15 +2,11 @@
 pub const GGML_METAL_MAX_BUFFERS: u32 = 16;
 pub const GGML_METAL_MAX_COMMAND_BUFFERS: u32 = 32;
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct ggml_tensor {
-    _unused: [u8; 0],
-}
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct ggml_cgraph {
-    _unused: [u8; 0],
+extern "C" {
+    pub fn ggml_metal_log_set_callback(
+        log_callback: ggml_log_callback,
+        user_data: *mut ::std::os::raw::c_void,
+    );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]

diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index dacc67e4..05644d53 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -86,6 +86,9 @@ pub struct InferenceSession {

     /// Work buffer for graph planning
     work_buffer: Vec<u8>,
+
+    /// Whether the session can use the GPU
+    use_gpu: bool,
 }

 pub struct BuildContext<'session> {
@@ -120,7 +123,7 @@ impl InferenceSession {
             ..
         } = *params;

-        let context_byte_size = {
+        let cache_byte_size = {
            let mut size = 0;
            size += mulf!(
                context_size,
                n_layer,
                n_embd,
                ggml::type_sizef(config.memory_k_type.into())
            ); // memory_k
            size += mulf!(
                context_size,
                n_layer,
                n_embd,
                ggml::type_sizef(config.memory_v_type.into())
            ); // memory_v
-            size += (5 + 10 * n_layer) * 256; // object overhead
+            size += 2 * 1024 * 1024; // overhead

            size
        };

        log::info!(
            "Allocating {:.2} MB for KV-memory",
-            context_byte_size / (1024 * 1024)
+            cache_byte_size / (1024 * 1024)
        );

        if use_gpu {
@@ -153,7 +156,7 @@
        // context is only accessed from one thread at a time, but I've already spent enough
        // time on this as-is.
#[allow(clippy::arc_with_non_send_sync)] - let session_ctx = Arc::new(ggml::Context::new_with_allocate(context_byte_size)); + let session_ctx = Arc::new(ggml::Context::new_with_allocate(cache_byte_size)); // Initialize key + value memory tensors let n_mem = n_layer * context_size; @@ -190,7 +193,7 @@ impl InferenceSession { InferenceSession { _session_ctx: session_ctx, - _memory_size: context_byte_size, + _memory_size: cache_byte_size, config, memory_k, memory_v, @@ -206,6 +209,7 @@ impl InferenceSession { allocator, context_size, work_buffer: vec![0], + use_gpu, } } @@ -252,18 +256,26 @@ impl InferenceSession { let graph_size = self.allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT; log::info!("Allocating {:.2} MB for graph", graph_size / (1024 * 1024)); - // Pre-allocate the buffer foor future use - let buffer = Buffer::new(graph_size); - self.allocator.switch_buffer(buffer, ggml::TENSOR_ALIGNMENT); + // Pre-allocate the buffer for future use + self.allocator + .resize_buffer(graph_size, ggml::TENSOR_ALIGNMENT); + + if self.use_gpu { + ggml::accelerator::set_scratch_size(graph_size); + } } + // Reset the context and allocator self.ctx0.recreate(); + self.allocator.reset(); let ctx0 = &mut self.ctx0; let mut embd = ctx0 .new_tensor_1d(ggml::Type::I32, input_tokens.len()) .set_name("embd"); + self.allocator.allocate(&embd); + let bc = BuildContext { ctx0: RefCell::new(ctx0), allocator: RefCell::new(&self.allocator), @@ -273,10 +285,6 @@ impl InferenceSession { n_past: self.n_past, }; - // Reset the allocator - self.allocator.reset(); - self.allocator.allocate(&embd); - let (mut built_gf, built_result) = builder(bc); // Build the graph @@ -285,6 +293,26 @@ impl InferenceSession { // Allocate the graph self.allocator.allocate_graph(&built_gf); + #[cfg(feature = "cublas")] + { + for mut leaf in built_gf.leafs(&ctx0) { + if leaf.backend() == ggml::accelerator::Backend::Gpu && !leaf.has_extras() { + unsafe { + let offset = leaf.data().offset_from(self.allocator.buffer.data()) as usize; + leaf.assign_scratch_offset(offset); + } + } + } + + for mut node in built_gf.nodes(&ctx0) { + if node.backend() == ggml::accelerator::Backend::Gpu && !node.has_extras() { + unsafe { + let offset = node.data().offset_from(self.allocator.buffer.data()) as usize; + node.assign_scratch_offset(offset); + } + } + } + } // Do Metal'y stuff #[cfg(feature = "metal")] { From fcbfb4d970eb4b350126e700872953e8d819ec84 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 1 Nov 2023 01:13:34 +0100 Subject: [PATCH 15/22] fix(ggml): bindgen issues --- binaries/generate-ggml-bindings/src/main.rs | 3 +++ crates/ggml/src/accelerator/metal.rs | 13 ++++--------- crates/ggml/sys/src/lib.rs | 10 +++++----- crates/ggml/sys/src/llama.rs | 2 +- crates/ggml/sys/src/metal.rs | 4 ++++ crates/llm-base/src/inference_session.rs | 2 +- crates/llm-base/src/loader.rs | 10 +++++----- crates/llm-base/src/util.rs | 2 +- 8 files changed, 24 insertions(+), 22 deletions(-) diff --git a/binaries/generate-ggml-bindings/src/main.rs b/binaries/generate-ggml-bindings/src/main.rs index 30991953..ad73ddba 100644 --- a/binaries/generate-ggml-bindings/src/main.rs +++ b/binaries/generate-ggml-bindings/src/main.rs @@ -90,6 +90,9 @@ fn generate_metal(ggml_path: &Path, src_path: &Path) { generate_extra("metal", ggml_path, src_path, |b| { b.header(ggml_path.join("ggml-metal.h").to_string_lossy()) .allowlist_file(r".*ggml-metal\.h") + .raw_line("use super::ggml_tensor;") + .raw_line("use super::ggml_log_callback;") + .raw_line("use 
super::ggml_cgraph;") }); } diff --git a/crates/ggml/src/accelerator/metal.rs b/crates/ggml/src/accelerator/metal.rs index 8fced466..a15e39f1 100644 --- a/crates/ggml/src/accelerator/metal.rs +++ b/crates/ggml/src/accelerator/metal.rs @@ -14,8 +14,8 @@ pub struct MetalContext { impl MetalContext { /// Create a new Metal context - pub fn new(n_threads: usize) -> Self { - let raw = unsafe { metal::ggml_metal_init(n_threads.try_into().unwrap()) }; + pub fn new() -> Self { + let raw = unsafe { metal::ggml_metal_init(1) }; MetalContext { contexts: vec![], @@ -83,19 +83,14 @@ impl MetalContext { unsafe { metal::ggml_metal_graph_compute( self.ptr.as_ptr(), - graph.inner as *mut ggml_sys::ggml_cgraph as *mut metal::ggml_cgraph, + graph.inner as *mut ggml_sys::ggml_cgraph, ); } } /// Reads a tensor from Metal pub fn get_tensor(&self, tensor: &Tensor) { - unsafe { - metal::ggml_metal_get_tensor( - self.ptr.as_ptr(), - tensor.ptr.as_ptr() as *mut metal::ggml_tensor, - ) - } + unsafe { metal::ggml_metal_get_tensor(self.ptr.as_ptr(), tensor.ptr.as_ptr()) } } } diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs index 46a2b194..71b34251 100644 --- a/crates/ggml/sys/src/lib.rs +++ b/crates/ggml/sys/src/lib.rs @@ -171,15 +171,15 @@ pub const ggml_unary_op_GGML_UNARY_OP_RELU: ggml_unary_op = 6; pub const ggml_unary_op_GGML_UNARY_OP_GELU: ggml_unary_op = 7; pub const ggml_unary_op_GGML_UNARY_OP_GELU_QUICK: ggml_unary_op = 8; pub const ggml_unary_op_GGML_UNARY_OP_SILU: ggml_unary_op = 9; -pub type ggml_unary_op = ::std::os::raw::c_int; +pub type ggml_unary_op = ::std::os::raw::c_uint; pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0; pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1; pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2; -pub type ggml_object_type = ::std::os::raw::c_int; +pub type ggml_object_type = ::std::os::raw::c_uint; pub const ggml_log_level_GGML_LOG_LEVEL_ERROR: ggml_log_level = 2; pub const ggml_log_level_GGML_LOG_LEVEL_WARN: ggml_log_level = 3; pub const ggml_log_level_GGML_LOG_LEVEL_INFO: ggml_log_level = 4; -pub type ggml_log_level = ::std::os::raw::c_int; +pub type ggml_log_level = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct ggml_object { @@ -1761,7 +1761,7 @@ extern "C" { pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0; pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1; pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2; -pub type ggml_op_pool = ::std::os::raw::c_int; +pub type ggml_op_pool = ::std::os::raw::c_uint; extern "C" { pub fn ggml_pool_1d( ctx: *mut ggml_context, @@ -3081,7 +3081,7 @@ pub const gguf_type_GGUF_TYPE_UINT64: gguf_type = 10; pub const gguf_type_GGUF_TYPE_INT64: gguf_type = 11; pub const gguf_type_GGUF_TYPE_FLOAT64: gguf_type = 12; pub const gguf_type_GGUF_TYPE_COUNT: gguf_type = 13; -pub type gguf_type = ::std::os::raw::c_int; +pub type gguf_type = ::std::os::raw::c_uint; #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct gguf_context { diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs index 5d06fd4f..d3552cd9 100644 --- a/crates/ggml/sys/src/llama.rs +++ b/crates/ggml/sys/src/llama.rs @@ -23,4 +23,4 @@ pub const LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16; pub const LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17; pub const LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18; pub const LLAMA_FTYPE_GUESSED: llama_ftype = 1024; -pub type llama_ftype = ::std::os::raw::c_int; +pub type llama_ftype = 
::std::os::raw::c_uint;

diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs
index cd8040ec..e2d7c621 100644
--- a/crates/ggml/sys/src/metal.rs
+++ b/crates/ggml/sys/src/metal.rs
@@ -1,5 +1,9 @@
 /* automatically generated by rust-bindgen 0.65.1 */

+use super::ggml_tensor;
+use super::ggml_log_callback;
+use super::ggml_cgraph;
+
 pub const GGML_METAL_MAX_BUFFERS: u32 = 16;
 pub const GGML_METAL_MAX_COMMAND_BUFFERS: u32 = 32;
 extern "C" {

diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index 05644d53..c86ea4b0 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -178,7 +178,7 @@ impl InferenceSession {
         #[cfg(feature = "metal")]
         let metal_context = {
             if use_gpu {
-                let mut metal_context = MetalContext::new(config.n_threads);
+                let mut metal_context = MetalContext::new();
                 metal_context.add_scratch_buffer(ctx0.storage().as_buffer().unwrap());

                 for buf in scratch.iter() {

diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs
index d95ed348..f049a0cd 100644
--- a/crates/llm-base/src/loader.rs
+++ b/crates/llm-base/src/loader.rs
@@ -29,16 +29,16 @@ pub struct FileType {
     /// The quantization version.
     pub quantization_version: u32,
 }
-impl From<FileType> for i32 {
+impl From<FileType> for u32 {
     fn from(value: FileType) -> Self {
-        (value.quantization_version * ggml::QNT_VERSION_FACTOR) as i32
+        (value.quantization_version * ggml::QNT_VERSION_FACTOR) as u32
             + ggml::sys::llama::llama_ftype::from(value.format)
     }
 }
-impl TryFrom<i32> for FileType {
+impl TryFrom<u32> for FileType {
     type Error = ();

-    fn try_from(value: i32) -> Result<Self, Self::Error> {
+    fn try_from(value: u32) -> Result<Self, Self::Error> {
         let format = FileTypeFormat::try_from(
             ((value as u32) % ggml::QNT_VERSION_FACTOR) as ggml::sys::llama::llama_ftype,
         )?;
@@ -252,7 +252,7 @@ pub enum LoadError {
     #[error("unsupported ftype: {0}")]
     /// The `ftype` hyperparameter had an invalid value. This usually means that the format used
     /// by this file is unrecognized by this version of `llm`.
-    UnsupportedFileType(i32),
+    UnsupportedFileType(u32),
     #[error("invalid magic number {magic} for {path:?}")]
     /// An invalid magic number was encountered during the loading process.
     InvalidMagic {

diff --git a/crates/llm-base/src/util.rs b/crates/llm-base/src/util.rs
index e63522a2..70fe2994 100644
--- a/crates/llm-base/src/util.rs
+++ b/crates/llm-base/src/util.rs
@@ -28,7 +28,7 @@
 use crate::{FileType, LoadError};

 /// Read the filetype from a reader.
 pub fn read_filetype(reader: &mut dyn BufRead) -> Result<FileType, LoadError> {
-    let ftype = read_i32(reader)?;
+    let ftype = read_u32(reader)?;
     FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))
 }

From 58193c4da9cb57f94a8467143ed4173b762eac18 Mon Sep 17 00:00:00 2001
From: Nicolas Luck
Date: Fri, 3 Nov 2023 01:08:59 +0100
Subject: [PATCH 16/22] Fix snapshot serde: rename logits to last_logits

---
 crates/llm-base/src/inference_session.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index 8c86b0e1..493513e6 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -608,7 +608,7 @@ impl InferenceSession {
             npast: self.n_past,
             config: self.config,
             tokens: self.tokens.clone(),
-            logits: self.last_logits.clone(),
+            last_logits: self.last_logits.clone(),
             memory_k,
             memory_v,
         }
@@ -746,7 +746,7 @@ pub struct InferenceSnapshotRef<'a> {
     /// All tokens generated by this inference session.
     pub tokens: Vec<TokenId>,
     /// The vector of logits that was produced after the last inference.
-    pub logits: Vec<f32>,
+    pub last_logits: Vec<f32>,
     /// The contents of the 'key' memory tensor.
     #[serde(with = "serde_bytes")]
     pub memory_k: &'a [u8],
@@ -763,7 +763,7 @@ impl InferenceSnapshotRef<'_> {
             npast: self.npast,
             config: self.config,
             tokens: self.tokens.clone(),
-            last_logits: self.logits.clone(),
+            last_logits: self.last_logits.clone(),
             memory_k: self.memory_k.to_vec(),
             memory_v: self.memory_v.to_vec(),
         }

From 5fa9bb28ce28690102e7eddd3f561a3582c55008 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Mon, 6 Nov 2023 01:34:14 -0700
Subject: [PATCH 17/22] Update to llm-samplers v0.0.7

---
 Cargo.lock                         |  5 ++-
 Cargo.toml                         |  3 +-
 binaries/llm-cli/src/cli_args.rs   |  9 ++++++
 binaries/llm-test/src/inference.rs | 10 +++---
 crates/llm-base/src/lib.rs         |  2 +-
 crates/llm-base/src/samplers.rs    | 52 +++++++++++++++++++-----------
 6 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 049d70df..e2d26e79 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1374,9 +1374,8 @@ dependencies = [

 [[package]]
 name = "llm-samplers"
-version = "0.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7553f60d113c9cdc6a5402456a31cd9a273bef79f6f16d8a4f7b4bedf5f754b2"
+version = "0.0.7"
+source = "git+https://github.com/KerfuffleV2/llm-samplers?branch=feat-v0.0.7#8c72d0c2838471bfbe26394694b41054bd789549"
 dependencies = [
  "anyhow",
  "num-traits",

diff --git a/Cargo.toml b/Cargo.toml
index ae5b22f7..2daf8d62 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,7 +32,8 @@
 clap = { version = "4.1.8", features = ["derive"] }
 memmap2 = "0.5.10"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 tracing = { version = "0.1", features = ["log"] }
-llm-samplers = "=0.0.6"
+llm-samplers = { git = "https://github.com/KerfuffleV2/llm-samplers", branch = "feat-v0.0.7" }
+# llm-samplers = "=0.0.6"

 # Config for 'cargo dist'
 [workspace.metadata.dist]

diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs
index 21b4a897..5dc2b1e6 100644
--- a/binaries/llm-cli/src/cli_args.rs
+++ b/binaries/llm-cli/src/cli_args.rs
@@ -290,6 +290,15 @@ pub struct Generate {
     /// top_p - The probability for the top tokens are added until the result is greater or equal to P and at least min_keep tokens have been seen.
     ///   p(0.95): The cumulative probability after which no more tokens are kept for sampling.
     ///   min_keep(1): Minimum tokens to keep. Setting this to 0 is not recommended.
+    ///
+    /// top_a (default: disabled) - This sampler prunes tokens that don't meet a threshold based on the most probable token. The formula is `a1 * pow(max_prob, a2)`. See https://github.com/BlinkDL/RWKV-LM#the-top-a-sampling-method for more information.
+    ///   a1(0.0): Threshold scale. A reasonable value is 0.2. Setting either a1 or a2 to 0 disables the sampler.
+    ///   a2(0.0): Threshold power. A reasonable value is 2.
+    ///   min_keep(1): Minimum tokens to keep. Setting this to 0 is not recommended.
+    ///
+    /// min_p (default: disabled) - This sampler prunes tokens that don't meet a certain percentage of the most probable token. For example if `p` is `0.05` then after `min_keep` is satisfied, other tokens must be at least 5% of the most probable token. See https://github.com/ggerganov/llama.cpp/issues/3483 for more information.
+    ///   p(0.0): Probability threshold. 0.05 to 0.2 are good starting values to try. Setting this to 0 disables the sampler.
+    ///   min_keep(1): Minimum tokens to keep. Setting this to 0 is not recommended.
     #[arg(long = "sampler", short = 's', verbatim_doc_comment)]
     pub sampler_options: Vec<String>,

diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs
index a9ace889..3666167e 100644
--- a/binaries/llm-test/src/inference.rs
+++ b/binaries/llm-test/src/inference.rs
@@ -92,14 +92,14 @@ fn run_inference(

 // Takes the most likely element from the logits, except if they've appeared in `previous_tokens`
 // at all
 #[derive(Debug, Default)]
-struct DeterministicSampler(SampleGreedy<TokenId>);
+struct DeterministicSampler(SampleGreedy);

-impl Sampler<TokenId, f32> for DeterministicSampler {
+impl Sampler for DeterministicSampler {
     fn sample<'a>(
         &mut self,
-        res: &mut dyn HasSamplerResources<TokenId = TokenId>,
-        logits: &'a mut Logits<TokenId, f32>,
-    ) -> anyhow::Result<&'a mut Logits<TokenId, f32>> {
+        res: &mut dyn HasSamplerResources,
+        logits: &'a mut Logits,
+    ) -> anyhow::Result<&'a mut Logits> {
         let mut flat_bias = Default::default();

         // This might look a little weird, but it's necessary because the resource

diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs
index e07c8852..f0a88a8a 100644
--- a/crates/llm-base/src/lib.rs
+++ b/crates/llm-base/src/lib.rs
@@ -60,7 +60,7 @@ pub struct InferenceParameters {
     /// This can be anything that implements [Sampler]. Refer to
     /// the `llm-samplers` documentation for possible samplers and suggested
     /// combinations:
-    pub sampler: Arc<Mutex<dyn Sampler<TokenId, f32>>>,
+    pub sampler: Arc<Mutex<dyn Sampler>>,
 }

 //Since Sampler implements Send and Sync, InferenceParameters should too.

diff --git a/crates/llm-base/src/samplers.rs b/crates/llm-base/src/samplers.rs
index 7a179f0b..f0b07b9e 100644
--- a/crates/llm-base/src/samplers.rs
+++ b/crates/llm-base/src/samplers.rs
@@ -59,7 +59,7 @@ pub enum SamplingError {
 /// to ensure a valid configuration.
 pub struct ConfiguredSamplers {
     /// A builder from the `llm-samplers` crate.
-    pub builder: SamplerChainBuilder<TokenId, f32>,
+    pub builder: SamplerChainBuilder,
     /// Mirostat 1 is present.
     pub mirostat1: bool,
     /// Mirostat 2 is present.
@@ -74,15 +74,17 @@ pub struct ConfiguredSamplers {
 /// We call a configuration of samplers that run in a certain order a "chain".
 /// Here is a description of the default chain `llm` uses:
 ///
-/// 1. Repetition (present by default, multiple allowed)
-/// 2. Frequency/Presence (optional, multiple allowed)
-/// 3. Sequence Repetition (optional, multiple allowed)
-/// 4. Top-K (present by default - incompatible with Mirostat)
-/// 5. Tail Free (optional - incompatible with Mirostat)
-/// 6. Locally Typical (optional - incompatible with Mirostat)
-/// 7. Top-P (present by default - incompatible with Mirostat)
-/// 8. Temperature (present by default)
-/// 9. A Mirostat 1 or 2 sampler if configured, otherwise Random Distribution.
+/// 1. Repetition (present by default, multiple allowed)
+/// 2. Frequency/Presence (optional, multiple allowed)
+/// 3. Sequence Repetition (optional, multiple allowed)
+/// 4. Top-K (present by default - incompatible with Mirostat)
+/// 5. Tail Free (optional - incompatible with Mirostat)
+/// 6. Locally Typical (optional - incompatible with Mirostat)
+/// 7. Top-P (present by default - incompatible with Mirostat)
+/// 8. Top-A (optional - incompatible with Mirostat)
+/// 9. Min-P (optional - incompatible with Mirostat)
+/// 10. Temperature (present by default)
+/// 11. A Mirostat 1 or 2 sampler if configured, otherwise Random Distribution.
 ///
 /// Samplers listed as "present by default" but incompatible with Mirostat will
 /// only be enabled by default if there is no Mirostat sampler enabled.
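// ---------------------------------------------------------------------------
// [Editor's note: a hedged usage sketch, not part of the patch.] Building a
// chain from a CLI-style option string, using only items visible in this
// patch (`ConfiguredSamplers::from_str`, `ensure_default_slots`, and
// `builder.into_chain`) and assuming they are reachable from the call site;
// the exact "name:key=value" syntax is inferred from the CLI help text above
// rather than verified against the parser:
use std::str::FromStr;
use std::sync::{Arc, Mutex};

fn example_chain() {
    // Enable Min-P at p = 0.05, then fill in the remaining default slots.
    let mut configured =
        ConfiguredSamplers::from_str("minp:p=0.05").expect("invalid sampler string");
    configured.ensure_default_slots();
    let _sampler = Arc::new(Mutex::new(configured.builder.into_chain()));
}
// ---------------------------------------------------------------------------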
@@ -142,6 +144,20 @@ impl Default for ConfiguredSamplers {
                     Option::<String>::None,
                 ),
             ),
+            (
+                "topa",
+                SamplerSlot::new_single(
+                    || Box::new(SampleTopA::default().a1(0.0).a2(0.0)),
+                    Option::<String>::None,
+                ),
+            ),
+            (
+                "minp",
+                SamplerSlot::new_single(
+                    || Box::new(SampleMinP::default().p(0.0)),
+                    Option::<String>::None,
+                ),
+            ),
             (
                 "temperature",
                 SamplerSlot::new_single(
@@ -203,7 +219,7 @@ impl ConfiguredSamplers {
             ))?
         } else if (self.mirostat1 || self.mirostat2) && self.incompat_mirostat {
             Err(SamplerConfigurationError::SamplerCombinationError(
-                "Cannot enable top-p, top-k, locally typical or tail free samplers with Mirostat 1 or 2".to_string(),
+                "Cannot enable top-p, top-k, top-a, min-p, locally typical or tail free samplers with Mirostat 1 or 2".to_string(),
             ))?
         }
         Ok(())
@@ -245,7 +261,9 @@ impl FromStr for ConfiguredSamplers {
             .inspect(|(name, _slot)| match name.as_str() {
                 "mirostat1" => result.mirostat1 = true,
                 "mirostat2" => result.mirostat2 = true,
-                "topp" | "topk" | "locallytypical" | "tailfree" => result.incompat_mirostat = true,
+                "topa" | "minp" | "topp" | "topk" | "locallytypical" | "tailfree" => {
+                    result.incompat_mirostat = true
+                }
                 _ => (),
             })
             .collect::<Vec<_>>();
@@ -269,7 +287,7 @@ impl FromStr for ConfiguredSamplers {

 /// Sample a token. This convenience function handles building
 /// the sampler resources and logits objects the sampler needs.
 pub fn sample_token(
-    mut sampler: impl Sampler<TokenId, f32>,
+    mut sampler: impl Sampler,
     rng: &mut impl rand::Rng,
     previous_tokens: &[TokenId],
     last_logits: impl IntoIterator<Item = f32>,
@@ -297,7 +315,7 @@ pub fn build_sampler(
     n_vocab: usize,
     bias: &[(TokenId, f32)],
     args: &[impl AsRef<str>],
-) -> Result<Arc<Mutex<dyn Sampler<TokenId, f32>>>, SamplerConfigurationError> {
+) -> Result<Arc<Mutex<dyn Sampler>>, SamplerConfigurationError> {
     let mut samplers = SamplerChain::new();

     if !bias.is_empty() {
@@ -326,7 +344,7 @@
 }

 /// Get the default sampler chain.
-pub fn default_samplers() -> Arc<Mutex<dyn Sampler<TokenId, f32>>> {
+pub fn default_samplers() -> Arc<Mutex<dyn Sampler>> {
     let mut result = ConfiguredSamplers::default();
     result.ensure_default_slots();
     Arc::new(Mutex::new(result.builder.into_chain()))
@@ -349,8 +367,6 @@ impl<'pt, 'r> fmt::Debug for SamplerResources<'pt, 'r> {
 }

 impl<'pt, 'r> HasSamplerResources for SamplerResources<'pt, 'r> {
-    type TokenId = TokenId;
-
     fn with_rng_mut(
         &mut self,
         fun: &mut dyn FnMut(&mut dyn rand::RngCore),
@@ -359,7 +375,7 @@
         Ok(())
     }

-    fn with_last_tokens(&self, fun: &mut dyn FnMut(&[Self::TokenId])) -> Result<(), SamplerError> {
+    fn with_last_tokens(&self, fun: &mut dyn FnMut(&[TokenId])) -> Result<(), SamplerError> {
         fun(self.previous_tokens);
         Ok(())
     }

From 9df5a7e48cfa2c6df81958982efcdbedd21bbd74 Mon Sep 17 00:00:00 2001
From: KerfuffleV2
Date: Thu, 9 Nov 2023 01:31:09 -0700
Subject: [PATCH 18/22] Depend on llm-samplers 0.0.7 release

---
 Cargo.lock                | 3 ++-
 Cargo.toml                | 3 +--
 crates/ggml/sys/llama-cpp | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e2d26e79..c3b959a5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1375,7 +1375,8 @@
 [[package]]
 name = "llm-samplers"
 version = "0.0.7"
-source = "git+https://github.com/KerfuffleV2/llm-samplers?branch=feat-v0.0.7#8c72d0c2838471bfbe26394694b41054bd789549"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e85df656cd89e7702cb56171d75aa77c7bec828af7d2054d9987c34411cf896"
 dependencies = [
  "anyhow",
  "num-traits",

diff --git a/Cargo.toml b/Cargo.toml
index 2daf8d62..2787805e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -32,8 +32,7 @@
 clap = { version = "4.1.8", features = ["derive"] }
 memmap2 = "0.5.10"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 tracing = { version = "0.1", features = ["log"] }
-llm-samplers = { git = "https://github.com/KerfuffleV2/llm-samplers", branch = "feat-v0.0.7" }
-# llm-samplers = "=0.0.6"
+llm-samplers = "=0.0.7"

 # Config for 'cargo dist'
 [workspace.metadata.dist]

diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index 8183159c..1a941869 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit 8183159cf3def112f6d1fe94815fce70e1bffa12
+Subproject commit 1a941869cbef8e9cc351a6c6987e4ae3b0f021f7

From 23c3047cba4a41c53a2ffbd9b5b9e6994aeb7d67 Mon Sep 17 00:00:00 2001
From: Philpax
Date: Sun, 12 Nov 2023 19:18:19 +0100
Subject: [PATCH 19/22] fix(ggml): don't use Neon on macOS aarch64

This code doesn't feel right. The original order pre-#426 should work; why
would `-mcpu=native` not work in that case? I think this is something that
will need revisiting.
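[Editor's note: the sketch below is an editor's suggestion for the "revisiting" mentioned above, not part of the patch. It gates the CPU flags on the target triple explicitly instead of on host == target; `CARGO_CFG_TARGET_ARCH`, `CARGO_CFG_TARGET_OS`, `HOST`, and `TARGET` are standard Cargo build-script environment variables, and `cc::Build::flag` is the API the existing build.rs already uses.]

fn tune_cpu_flags(build: &mut cc::Build) {
    let arch = std::env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
    let os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
    if os == "macos" && arch == "aarch64" {
        // Apple clang rejects -mfpu=neon; NEON is already implied by -mcpu=apple-m1.
        build.flag("-mcpu=apple-m1");
    } else if std::env::var("HOST") == std::env::var("TARGET") {
        // Only tune for the native CPU when not cross-compiling.
        build.flag("-mcpu=native");
    }
}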
--- crates/ggml/sys/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index 736fa156..b2921d47 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -77,9 +77,9 @@ fn main() { if compiler.is_like_clang() || compiler.is_like_gnu() { if target_os == "macos" { build.flag("-mcpu=apple-m1"); - build.flag("-mfpu=neon"); } else if std::env::var("HOST") == std::env::var("TARGET") { build.flag("-mcpu=native"); + build.flag("-mfpu=neon"); } build.flag("-pthread"); } From b4ca9245c8057a60bec01ad08e41cbac7838c07f Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 12 Nov 2023 19:27:27 +0100 Subject: [PATCH 20/22] chore: update vulnerable deps --- Cargo.lock | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c3b959a5..d3e34d5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -191,9 +191,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.3.3" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "block-buffer" @@ -755,7 +755,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef033ed5e9bad94e55838ca0ca906db0e043f517adda0c8b79c7a8c66c93c1b5" dependencies = [ "cfg-if", - "rustix 0.38.1", + "rustix 0.38.13", "windows-sys 0.48.0", ] @@ -1169,7 +1169,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24fddda5af7e54bf7da53067d6e802dbcc381d0a8eef629df528e3ebf68755cb" dependencies = [ "hermit-abi 0.3.1", - "rustix 0.38.1", + "rustix 0.38.13", "windows-sys 0.48.0", ] @@ -1251,9 +1251,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "linux-raw-sys" -version = "0.4.3" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09fc20d2ca12cb9f044c93e3bd6d32d523e6e2ec3db4f7b2939cd99026ecd3f0" +checksum = "969488b55f8ac402214f3f5fd243ebb7206cf82de60d3172994707a4bcc2b829" [[package]] name = "llm" @@ -2015,9 +2015,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" -version = "0.37.21" +version = "0.37.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f25693a73057a1b4cb56179dd3c7ea21a7c6c5ee7d85781f5749b46f34b79c" +checksum = "fea8ca367a3a01fe35e6943c400addf443c0f57670e6ec51196f71a4b8762dd2" dependencies = [ "bitflags 1.3.2", "errno", @@ -2029,14 +2029,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.1" +version = "0.38.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc6396159432b5c8490d4e301d8c705f61860b8b6c863bf79942ce5401968f3" +checksum = "d7db8590df6dfcd144d22afd1b83b36c21a18d7cbc1dc4bb5295a8712e9eb662" dependencies = [ - "bitflags 2.3.3", + "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.3", + "linux-raw-sys 0.4.11", "windows-sys 0.48.0", ] @@ -2344,7 +2344,7 @@ dependencies = [ "cfg-if", "fastrand", "redox_syscall 0.3.5", - "rustix 0.37.21", + "rustix 0.37.27", "windows-sys 0.48.0", ] From 8a3aeecd4e58b96850528bff3e6c41baa71ae6d9 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 12 Nov 2023 19:54:07 +0100 Subject: [PATCH 21/22] chore: update rustix 0.38 This required 
bumping other dependencies and hacking up the Cargo.lock. --- Cargo.lock | 70 +++++++++++------------------ Cargo.toml | 11 +++-- binaries/llm-cli/src/cli_args.rs | 6 +-- binaries/llm-cli/src/interactive.rs | 2 +- 4 files changed, 37 insertions(+), 52 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d3e34d5f..629013ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" dependencies = [ "gimli", ] @@ -101,17 +101,6 @@ version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi 0.1.19", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -120,9 +109,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "backtrace" -version = "0.3.68" +version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ "addr2line", "cc", @@ -404,13 +393,13 @@ checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "colored" -version = "2.0.0" +version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd" +checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6" dependencies = [ - "atty", + "is-terminal", "lazy_static", - "winapi", + "windows-sys 0.48.0", ] [[package]] @@ -755,7 +744,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef033ed5e9bad94e55838ca0ca906db0e043f517adda0c8b79c7a8c66c93c1b5" dependencies = [ "cfg-if", - "rustix 0.38.13", + "rustix 0.38.19", "windows-sys 0.48.0", ] @@ -932,9 +921,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.27.3" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" +checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" [[package]] name = "glob" @@ -982,15 +971,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - [[package]] name = "hermit-abi" version = "0.3.1" @@ -1151,7 +1131,7 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ - "hermit-abi 0.3.1", + "hermit-abi", "libc", "windows-sys 0.48.0", ] @@ -1164,12 +1144,12 @@ checksum = 
"28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "is-terminal" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24fddda5af7e54bf7da53067d6e802dbcc381d0a8eef629df528e3ebf68755cb" +checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ - "hermit-abi 0.3.1", - "rustix 0.38.13", + "hermit-abi", + "rustix 0.38.19", "windows-sys 0.48.0", ] @@ -1229,9 +1209,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" [[package]] name = "libloading" @@ -1601,7 +1581,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.3.1", + "hermit-abi", "libc", ] @@ -1613,9 +1593,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" -version = "0.31.1" +version = "0.32.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1" +checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0" dependencies = [ "memchr", ] @@ -2029,9 +2009,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.13" +version = "0.38.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7db8590df6dfcd144d22afd1b83b36c21a18d7cbc1dc4bb5295a8712e9eb662" +checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" dependencies = [ "bitflags 2.4.1", "errno", @@ -2256,9 +2236,9 @@ dependencies = [ [[package]] name = "spinoff" -version = "0.7.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fee259f96b31e7a18657d11741fe30d63f98e07de70e7a19d2b705ab9b331cdc" +checksum = "20aa2ed67fbb202e7b716ff8bfc6571dd9301617767380197d701c31124e88f6" dependencies = [ "colored", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 2787805e..045ecc9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ members = [ "crates/llm", "crates/llm-base", "crates/models/*", - "binaries/*" + "binaries/*", ] resolver = "2" default-members = ["binaries/llm-cli", "crates/llm"] @@ -27,7 +27,7 @@ anyhow = "1.0" rustyline = { version = "11.0.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0" } -spinoff = { version = "0.7.0", default-features = false, features = ["dots2"] } +spinoff = { version = "0.8.0", default-features = false, features = ["dots2"] } clap = { version = "4.1.8", features = ["derive"] } memmap2 = "0.5.10" tracing-subscriber = { version = "0.3", features = ["env-filter"] } @@ -45,7 +45,12 @@ ci = ["github"] # The installers to generate for each app installers = ["shell", "powershell"] # Target platforms to build apps for (Rust target-triple syntax) -targets = ["x86_64-unknown-linux-gnu", "x86_64-apple-darwin", "x86_64-pc-windows-msvc", "aarch64-apple-darwin"] +targets = [ + "x86_64-unknown-linux-gnu", + "x86_64-apple-darwin", + "x86_64-pc-windows-msvc", + "aarch64-apple-darwin", +] # The profile that 'cargo dist' will build with 
[profile.dist] diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 5dc2b1e6..e158db68 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -542,7 +542,7 @@ impl ModelLoad { let tokenizer_source = match self.model_and_tokenizer.to_source() { Ok(vs) => vs, Err(err) => { - if let Some(sp) = sp.take() { + if let Some(mut sp) = sp.take() { sp.fail(&format!("Failed to load tokenizer: {}", err)); } return Err(err); @@ -595,7 +595,7 @@ impl ModelLoad { file_size, tensor_count, } => { - if let Some(sp) = sp.take() { + if let Some(mut sp) = sp.take() { sp.success(&format!( "Loaded {tensor_count} tensors ({}) after {}ms", bytesize::to_string(file_size, false), @@ -610,7 +610,7 @@ impl ModelLoad { if model.is_err() { // If we've failed at loading the model, we probably haven't stopped the spinner yet. // Cancel it now if needed. - if let Some(sp) = sp { + if let Some(mut sp) = sp { sp.fail("Failed to load model") } } diff --git a/binaries/llm-cli/src/interactive.rs b/binaries/llm-cli/src/interactive.rs index 4657bc9d..3ad7e486 100644 --- a/binaries/llm-cli/src/interactive.rs +++ b/binaries/llm-cli/src/interactive.rs @@ -141,7 +141,7 @@ fn feed_prompt_with_spinner( prompt.insert(0, '\n'); } - let sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None); + let mut sp = spinoff::Spinner::new(spinoff::spinners::Dots2, "".to_string(), None); let result = session.feed_prompt( model, &prompt, From eddf95309e224fed7375d54aae66e1e0b4395a8e Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 12 Nov 2023 22:28:28 +0100 Subject: [PATCH 22/22] chore: fix precommit --- binaries/generate-ggml-bindings/src/main.rs | 4 ++-- crates/ggml/sys/src/metal.rs | 4 ++-- crates/llm-base/src/loader.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/binaries/generate-ggml-bindings/src/main.rs b/binaries/generate-ggml-bindings/src/main.rs index ad73ddba..1878f471 100644 --- a/binaries/generate-ggml-bindings/src/main.rs +++ b/binaries/generate-ggml-bindings/src/main.rs @@ -90,9 +90,9 @@ fn generate_metal(ggml_path: &Path, src_path: &Path) { generate_extra("metal", ggml_path, src_path, |b| { b.header(ggml_path.join("ggml-metal.h").to_string_lossy()) .allowlist_file(r".*ggml-metal\.h") - .raw_line("use super::ggml_tensor;") - .raw_line("use super::ggml_log_callback;") .raw_line("use super::ggml_cgraph;") + .raw_line("use super::ggml_log_callback;") + .raw_line("use super::ggml_tensor;") }); } diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs index e2d7c621..464db3ce 100644 --- a/crates/ggml/sys/src/metal.rs +++ b/crates/ggml/sys/src/metal.rs @@ -1,8 +1,8 @@ /* automatically generated by rust-bindgen 0.65.1 */ -use super::ggml_tensor; -use super::ggml_log_callback; use super::ggml_cgraph; +use super::ggml_log_callback; +use super::ggml_tensor; pub const GGML_METAL_MAX_BUFFERS: u32 = 16; pub const GGML_METAL_MAX_COMMAND_BUFFERS: u32 = 32; diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 2e80495c..f00b2974 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -40,12 +40,12 @@ impl TryFrom for FileType { fn try_from(value: u32) -> Result { let format = FileTypeFormat::try_from( - ((value as u32) % ggml::QNT_VERSION_FACTOR) as ggml::sys::llama::llama_ftype, + (value % ggml::QNT_VERSION_FACTOR) as ggml::sys::llama::llama_ftype, )?; Ok(Self { format, - quantization_version: (value as u32) / ggml::QNT_VERSION_FACTOR, + quantization_version: value / 
ggml::QNT_VERSION_FACTOR, }) } }