Skip to content

Commit

Permalink
Quantized moondream implementation and BOS token (huggingface#1980)
Browse files Browse the repository at this point in the history
* moondream implementation

* add moondream example

* change config default activation

* Add assets and integrate phi mixformer with example

* Make use of kv cache and fix seq_len bug; Clean up example code

* Add README link to example

* Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig

* Delete image

* Use apply instead of forward

* Pass the BOS token at the beginning of the tensor.

* Quantize moondream.

* Forward pass with the image BOS token.

* Clippy.

* Use q4_0 quantization.

* Add pointers for sequence and tokens; Remove seq_len conditional
  • Loading branch information
santiagomed committed Apr 1, 2024
1 parent 308ea07 commit ea0d8d3
Show file tree
Hide file tree
Showing 6 changed files with 393 additions and 32 deletions.
93 changes: 77 additions & 16 deletions candle-examples/examples/moondream/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,19 @@ use clap::Parser;

use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::{generation::LogitsProcessor, models::moondream};
use candle_transformers::{
generation::LogitsProcessor,
models::{moondream, quantized_moondream},
};
use tokenizers::Tokenizer;

/// The two backends the moondream example can run: the full-precision
/// safetensors model or the quantized (GGUF) variant. Call sites match on
/// this enum to dispatch `forward`/`forward_with_img`/`vision_encoder`.
enum Model {
// Full-precision model loaded from safetensors.
Moondream(moondream::Model),
// Quantized model loaded from a GGUF file.
Quantized(quantized_moondream::Model),
}

struct TextGeneration {
model: moondream::Model,
model: Model,
device: Device,
tokenizer: Tokenizer,
logits_processor: LogitsProcessor,
Expand All @@ -25,7 +33,7 @@ struct TextGeneration {
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: moondream::Model,
model: Model,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
Expand Down Expand Up @@ -64,6 +72,14 @@ impl TextGeneration {
let mut tokens = tokens.get_ids().to_vec();
let mut generated_tokens = 0usize;

// Moondream tokenizer bos_token is "<|endoftext|>"
// https://huggingface.co/vikhyatk/moondream2/blob/main/special_tokens_map.json
let bos_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") {
Some(token) => *token,
None => anyhow::bail!("cannot find the BOS token"),
};
// eos_token is "END"
// https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L100
let eos_token = match self.tokenizer.get_vocab(true).get("END") {
Some(token) => *token,
None => anyhow::bail!("cannot find the EOS token"),
Expand All @@ -75,11 +91,24 @@ impl TextGeneration {
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = if index > 0 {
self.model.text_model.forward(&input)?
match self.model {
Model::Moondream(ref mut model) => model.text_model.forward(&input)?,
Model::Quantized(ref mut model) => model.text_model.forward(&input)?,
}
} else {
self.model
.text_model
.forward_with_img(&input, image_embeds)?
let bos_token = Tensor::new(&[bos_token], &self.device)?.unsqueeze(0)?;
match self.model {
Model::Moondream(ref mut model) => {
model
.text_model
.forward_with_img(&bos_token, &input, image_embeds)?
}
Model::Quantized(ref mut model) => {
model
.text_model
.forward_with_img(&bos_token, &input, image_embeds)?
}
}
};
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
let logits = if self.repeat_penalty == 1. {
Expand Down Expand Up @@ -142,7 +171,7 @@ struct Args {
top_p: Option<f64>,

/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
#[arg(long, default_value_t = 0)]
seed: u64,

#[arg(long, default_value_t = 5000)]
Expand All @@ -156,12 +185,15 @@ struct Args {
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,

#[arg(long, default_value = "vikhyatk/moondream2")]
model_id: String,
#[arg(long)]
model_id: Option<String>,

#[arg(long, default_value = "main")]
revision: String,

#[arg(long)]
quantized: bool,

#[arg(long)]
model_file: Option<String>,

Expand Down Expand Up @@ -216,14 +248,30 @@ async fn main() -> anyhow::Result<()> {

let start = std::time::Instant::now();
let api = hf_hub::api::tokio::Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id.to_string(),
None => {
if args.quantized {
"santiagomed/candle-moondream".to_string()
} else {
"vikhyatk/moondream2".to_string()
}
}
};
let repo = api.repo(hf_hub::Repo::with_revision(
args.model_id,
model_id,
hf_hub::RepoType::Model,
args.revision,
));
let model_file = match args.model_file {
Some(m) => m.into(),
None => repo.get("model.safetensors").await?,
None => {
if args.quantized {
repo.get("model-q4_0.gguf").await?
} else {
repo.get("model.safetensors").await?
}
}
};
let tokenizer = match args.tokenizer_file {
Some(m) => m.into(),
Expand All @@ -234,22 +282,35 @@ async fn main() -> anyhow::Result<()> {

let start = std::time::Instant::now();
let device = candle_examples::device(args.cpu)?;
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let config = moondream::Config::v2();
let model = moondream::Model::new(&config, vb)?;
let model = if args.quantized {
let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
&model_file,
&device,
)?;
let model = quantized_moondream::Model::new(&config, vb)?;
Model::Quantized(model)
} else {
let vb =
unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let model = moondream::Model::new(&config, vb)?;
Model::Moondream(model)
};
println!("loaded the model in {:?}", start.elapsed());

let start = std::time::Instant::now();
let image = load_image(args.image)?.to_device(&device)?;
let image_embeds = image.unsqueeze(0)?;
let image_embeds = image_embeds.apply(model.vision_encoder())?;
let image_embeds = match model {
Model::Moondream(ref m) => image_embeds.apply(m.vision_encoder())?,
Model::Quantized(ref m) => image_embeds.apply(m.vision_encoder())?,
};
println!(
"loaded and encoded the image {image:?} in {:?}",
start.elapsed()
);

let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt);

let mut pipeline = TextGeneration::new(
model,
tokenizer,
Expand Down
18 changes: 11 additions & 7 deletions candle-transformers/src/models/mixformer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -438,16 +438,20 @@ impl MixFormerSequentialForCausalLM {
xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
}

pub fn forward_with_img(&mut self, xs: &Tensor, img_embeds: &Tensor) -> Result<Tensor> {
pub fn forward_with_img(
&mut self,
bos_token: &Tensor,
xs: &Tensor,
img_embeds: &Tensor,
) -> Result<Tensor> {
let _enter = self.span.enter();
let xs = xs.apply(&self.embedding)?;
let mut xs = Tensor::cat(&[img_embeds.clone(), xs], 1)?;
let bos_token = bos_token.apply(&self.embedding)?;
// Python implementation sequence order is <bos token embedding><img embedding><rest of text embedding>
// https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L43-L56
let mut xs = Tensor::cat(&[bos_token, img_embeds.clone(), xs], 1)?;
let (_b_size, seq_len, _embds) = xs.dims3()?;
let mask = if seq_len <= 1 {
None
} else {
Some(get_mask(seq_len, xs.device())?)
};
let mask = Some(get_mask(seq_len, xs.device())?);
for block in self.blocks.iter_mut() {
xs = block.forward(&xs, mask.as_ref())?
}
Expand Down
1 change: 1 addition & 0 deletions candle-transformers/src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub mod quantized_llama2_c;
pub mod quantized_metavoice;
pub mod quantized_mistral;
pub mod quantized_mixformer;
pub mod quantized_moondream;
pub mod quantized_mpt;
pub mod quantized_rwkv_v5;
pub mod quantized_rwkv_v6;
Expand Down
18 changes: 9 additions & 9 deletions candle-transformers/src/models/moondream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@ fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Te

#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
pub struct VisionConfig {
image_embedding_dim: usize,
model_dim: usize,
hidden_dim: usize,
hidden_features: usize,
embed_len: usize,
embed_dim: usize,
num_blocks: usize,
num_heads: usize,
act: candle_nn::Activation,
pub(crate) image_embedding_dim: usize,
pub(crate) model_dim: usize,
pub(crate) hidden_dim: usize,
pub(crate) hidden_features: usize,
pub(crate) embed_len: usize,
pub(crate) embed_dim: usize,
pub(crate) num_blocks: usize,
pub(crate) num_heads: usize,
pub(crate) act: candle_nn::Activation,
}

impl VisionConfig {
Expand Down
24 changes: 24 additions & 0 deletions candle-transformers/src/models/quantized_mixformer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,30 @@ impl MixFormerSequentialForCausalLM {
xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
}

/// Forward pass for the initial (prefill) step that includes the image.
///
/// The sequence fed to the transformer blocks is laid out as
/// `<bos token embedding><image embedding><text token embeddings>`, matching
/// the order used by the Python reference implementation:
/// https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L43-L56
///
/// Returns the logits for the last position of the combined sequence.
pub fn forward_with_img(
    &mut self,
    bos_token: &Tensor,
    xs: &Tensor,
    img_embeds: &Tensor,
) -> Result<Tensor> {
    let _enter = self.span.enter();
    // Embed the BOS token and the prompt tokens, then splice the
    // pre-computed image embedding between them along the sequence axis.
    let bos_embed = bos_token.apply(&self.embedding)?;
    let text_embed = xs.apply(&self.embedding)?;
    let mut hidden = Tensor::cat(&[bos_embed, img_embeds.clone(), text_embed], 1)?;
    let (_batch, seq_len, _dim) = hidden.dims3()?;
    // The concatenated sequence always spans more than one position, so a
    // causal mask is applied unconditionally.
    let mask = Some(get_mask(seq_len, hidden.device())?);
    for block in self.blocks.iter_mut() {
        hidden = block.forward(&hidden, mask.as_ref())?;
    }
    // Keep only the final position and project it to vocabulary logits.
    let logits = hidden
        .narrow(1, seq_len - 1, 1)?
        .apply(&self.head)?
        .squeeze(1)?;
    Ok(logits)
}

/// Reset the key/value attention cache of every transformer block, so the
/// next forward pass starts from an empty context.
pub fn clear_kv_cache(&mut self) {
    for block in self.blocks.iter_mut() {
        block.clear_kv_cache()
    }
}
Expand Down
Loading

0 comments on commit ea0d8d3

Please sign in to comment.