Skip to content

Commit

Permalink
Quantized moondream implementation and BOS token (huggingface#1980)
Browse files Browse the repository at this point in the history
* moondream implementation

* add moondream example

* change config default activation

* Add assets and integrate phi mixformer with example

* Make use of kv cache and fix seq_len bug; Clean up example code

* Add README link to example

* Remove pos_embed scaling; Remove assets; Add to README; Expand VisionConfig

* Delete image

* Use apply instead of forward

* Pass the BOS token at the beginning of the tensor.

* Quantize moondream.

* Forward pass with the image BOS token.

* Clippy.

* Use q4_0 quantization.

* Add pointers for sequence and tokens; Remove seq_len conditional
  • Loading branch information
santiagomed committed Apr 1, 2024
1 parent 308ea07 commit ea0d8d3
Show file tree
Hide file tree
Showing 6 changed files with 393 additions and 32 deletions.
93 changes: 77 additions & 16 deletions candle-examples/examples/moondream/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,19 @@ use clap::Parser;

use candle::{DType, Device, Tensor};
use candle_nn::VarBuilder;
use candle_transformers::{generation::LogitsProcessor, models::moondream};
use candle_transformers::{
generation::LogitsProcessor,
models::{moondream, quantized_moondream},
};
use tokenizers::Tokenizer;

/// The two backends the moondream example can run: the full-precision
/// safetensors model or the quantized (GGUF) variant. Call sites match on
/// this enum to dispatch `forward`/`forward_with_img`/`vision_encoder`.
enum Model {
// Full-precision model loaded from safetensors.
Moondream(moondream::Model),
// Quantized model loaded from a GGUF file.
Quantized(quantized_moondream::Model),
}

struct TextGeneration {
model: moondream::Model,
model: Model,
device: Device,
tokenizer: Tokenizer,
logits_processor: LogitsProcessor,
Expand All @@ -25,7 +33,7 @@ struct TextGeneration {
impl TextGeneration {
#[allow(clippy::too_many_arguments)]
fn new(
model: moondream::Model,
model: Model,
tokenizer: Tokenizer,
seed: u64,
temp: Option<f64>,
Expand Down Expand Up @@ -64,6 +72,14 @@ impl TextGeneration {
let mut tokens = tokens.get_ids().to_vec();
let mut generated_tokens = 0usize;

// Moondream tokenizer bos_token is "<|endoftext|>"
// https://huggingface.co/vikhyatk/moondream2/blob/main/special_tokens_map.json
let bos_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") {
Some(token) => *token,
None => anyhow::bail!("cannot find the BOS token"),
};
// eos_token is "END"
// https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L100
let eos_token = match self.tokenizer.get_vocab(true).get("END") {
Some(token) => *token,
None => anyhow::bail!("cannot find the EOS token"),
Expand All @@ -75,11 +91,24 @@ impl TextGeneration {
let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
let logits = if index > 0 {
self.model.text_model.forward(&input)?
match self.model {
Model::Moondream(ref mut model) => model.text_model.forward(&input)?,
Model::Quantized(ref mut model) => model.text_model.forward(&input)?,
}
} else {
self.model
.text_model
.forward_with_img(&input, image_embeds)?
let bos_token = Tensor::new(&[bos_token], &self.device)?.unsqueeze(0)?;
match self.model {
Model::Moondream(ref mut model) => {
model
.text_model
.forward_with_img(&bos_token, &input, image_embeds)?
}
Model::Quantized(ref mut model) => {
model
.text_model
.forward_with_img(&bos_token, &input, image_embeds)?
}
}
};
let logits = logits.squeeze(0)?.to_dtype(DType::F32)?;
let logits = if self.repeat_penalty == 1. {
Expand Down Expand Up @@ -142,7 +171,7 @@ struct Args {
top_p: Option<f64>,

/// The seed to use when generating random samples.
#[arg(long, default_value_t = 299792458)]
#[arg(long, default_value_t = 0)]
seed: u64,

#[arg(long, default_value_t = 5000)]
Expand All @@ -156,12 +185,15 @@ struct Args {
#[arg(long, default_value_t = 64)]
repeat_last_n: usize,

#[arg(long, default_value = "vikhyatk/moondream2")]
model_id: String,
#[arg(long)]
model_id: Option<String>,

#[arg(long, default_value = "main")]
revision: String,

#[arg(long)]
quantized: bool,

#[arg(long)]
model_file: Option<String>,

Expand Down Expand Up @@ -216,14 +248,30 @@ async fn main() -> anyhow::Result<()> {

let start = std::time::Instant::now();
let api = hf_hub::api::tokio::Api::new()?;
let model_id = match args.model_id {
Some(model_id) => model_id.to_string(),
None => {
if args.quantized {
"santiagomed/candle-moondream".to_string()
} else {
"vikhyatk/moondream2".to_string()
}
}
};
let repo = api.repo(hf_hub::Repo::with_revision(
args.model_id,
model_id,
hf_hub::RepoType::Model,
args.revision,
));
let model_file = match args.model_file {
Some(m) => m.into(),
None => repo.get("model.safetensors").await?,
None => {
if args.quantized {
repo.get("model-q4_0.gguf").await?
} else {
repo.get("model.safetensors").await?
}
}
};
let tokenizer = match args.tokenizer_file {
Some(m) => m.into(),
Expand All @@ -234,22 +282,35 @@ async fn main() -> anyhow::Result<()> {

let start = std::time::Instant::now();
let device = candle_examples::device(args.cpu)?;
let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let config = moondream::Config::v2();
let model = moondream::Model::new(&config, vb)?;
let model = if args.quantized {
let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
&model_file,
&device,
)?;
let model = quantized_moondream::Model::new(&config, vb)?;
Model::Quantized(model)
} else {
let vb =
unsafe { VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? };
let model = moondream::Model::new(&config, vb)?;
Model::Moondream(model)
};
println!("loaded the model in {:?}", start.elapsed());

let start = std::time::Instant::now();
let image = load_image(args.image)?.to_device(&device)?;
let image_embeds = image.unsqueeze(0)?;
let image_embeds = image_embeds.apply(model.vision_encoder())?;
let image_embeds = match model {
Model::Moondream(ref m) => image_embeds.apply(m.vision_encoder())?,
Model::Quantized(ref m) => image_embeds.apply(m.vision_encoder())?,
};
println!(
"loaded and encoded the image {image:?} in {:?}",
start.elapsed()
);

let prompt = format!("\n\nQuestion: {0}\n\nAnswer:", args.prompt);

let mut pipeline = TextGeneration::new(
model,
tokenizer,
Expand Down
18 changes: 11 additions & 7 deletions candle-transformers/src/models/mixformer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -438,16 +438,20 @@ impl MixFormerSequentialForCausalLM {
xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
}

pub fn forward_with_img(&mut self, xs: &Tensor, img_embeds: &Tensor) -> Result<Tensor> {
pub fn forward_with_img(
&mut self,
bos_token: &Tensor,
xs: &Tensor,
img_embeds: &Tensor,
) -> Result<Tensor> {
let _enter = self.span.enter();
let xs = xs.apply(&self.embedding)?;
let mut xs = Tensor::cat(&[img_embeds.clone(), xs], 1)?;
let bos_token = bos_token.apply(&self.embedding)?;
// Python implementation sequence order is <bos token embedding><img embedding><rest of text embedding>
// https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L43-L56
let mut xs = Tensor::cat(&[bos_token, img_embeds.clone(), xs], 1)?;
let (_b_size, seq_len, _embds) = xs.dims3()?;
let mask = if seq_len <= 1 {
None
} else {
Some(get_mask(seq_len, xs.device())?)
};
let mask = Some(get_mask(seq_len, xs.device())?);
for block in self.blocks.iter_mut() {
xs = block.forward(&xs, mask.as_ref())?
}
Expand Down
1 change: 1 addition & 0 deletions candle-transformers/src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub mod quantized_llama2_c;
pub mod quantized_metavoice;
pub mod quantized_mistral;
pub mod quantized_mixformer;
pub mod quantized_moondream;
pub mod quantized_mpt;
pub mod quantized_rwkv_v5;
pub mod quantized_rwkv_v6;
Expand Down
18 changes: 9 additions & 9 deletions candle-transformers/src/models/moondream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@ fn scaled_dot_product_attention(q: &Tensor, k: &Tensor, v: &Tensor) -> Result<Te

#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
pub struct VisionConfig {
image_embedding_dim: usize,
model_dim: usize,
hidden_dim: usize,
hidden_features: usize,
embed_len: usize,
embed_dim: usize,
num_blocks: usize,
num_heads: usize,
act: candle_nn::Activation,
pub(crate) image_embedding_dim: usize,
pub(crate) model_dim: usize,
pub(crate) hidden_dim: usize,
pub(crate) hidden_features: usize,
pub(crate) embed_len: usize,
pub(crate) embed_dim: usize,
pub(crate) num_blocks: usize,
pub(crate) num_heads: usize,
pub(crate) act: candle_nn::Activation,
}

impl VisionConfig {
Expand Down
24 changes: 24 additions & 0 deletions candle-transformers/src/models/quantized_mixformer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,30 @@ impl MixFormerSequentialForCausalLM {
xs.narrow(1, seq_len - 1, 1)?.apply(&self.head)?.squeeze(1)
}

/// Forward pass for the initial (prefill) step that includes the image.
///
/// The sequence fed to the transformer blocks is laid out as
/// `<bos token embedding><image embedding><text token embeddings>`, matching
/// the order used by the Python reference implementation:
/// https://github.com/vikhyat/moondream/blob/a9d788a20d1543fb1479edc54106e88cff7759d3/moondream/moondream.py#L43-L56
///
/// Returns the logits for the last position of the combined sequence.
pub fn forward_with_img(
    &mut self,
    bos_token: &Tensor,
    xs: &Tensor,
    img_embeds: &Tensor,
) -> Result<Tensor> {
    let _enter = self.span.enter();
    // Embed the BOS token and the prompt tokens, then splice the
    // pre-computed image embedding between them along the sequence axis.
    let bos_embed = bos_token.apply(&self.embedding)?;
    let text_embed = xs.apply(&self.embedding)?;
    let mut hidden = Tensor::cat(&[bos_embed, img_embeds.clone(), text_embed], 1)?;
    let (_batch, seq_len, _dim) = hidden.dims3()?;
    // The concatenated sequence always spans more than one position, so a
    // causal mask is applied unconditionally.
    let mask = Some(get_mask(seq_len, hidden.device())?);
    for block in self.blocks.iter_mut() {
        hidden = block.forward(&hidden, mask.as_ref())?;
    }
    // Keep only the final position and project it to vocabulary logits.
    let logits = hidden
        .narrow(1, seq_len - 1, 1)?
        .apply(&self.head)?
        .squeeze(1)?;
    Ok(logits)
}

/// Reset the key/value attention cache of every transformer block, so the
/// next forward pass starts from an empty context.
pub fn clear_kv_cache(&mut self) {
    for block in self.blocks.iter_mut() {
        block.clear_kv_cache()
    }
}
Expand Down
Loading

0 comments on commit ea0d8d3

Please sign in to comment.