Add the SigLIP model. (huggingface#2515)
* Add the SigLIP model.

* Add more to the forward pass of the vision model.

* Complete the forward pass.

* Add the siglip example.

* Fix.

* Another fix.

* Get everything in place.

* Add a readme.
LaurentMazare authored Sep 28, 2024
1 parent 62525e8 commit 261ed65
Showing 8 changed files with 797 additions and 54 deletions.
44 changes: 3 additions & 41 deletions candle-examples/examples/clip/main.rs
@@ -12,7 +12,6 @@ use candle_nn::{ops::softmax, VarBuilder};
 use candle_transformers::models::clip;
 
 use tokenizers::Tokenizer;
-use tracing::info;
 
 #[derive(Parser)]
 struct Args {
@@ -40,15 +39,12 @@ fn load_image<T: AsRef<std::path::Path>>(path: T, image_size: usize) -> anyhow::
         height as u32,
         image::imageops::FilterType::Triangle,
     );
-
     let img = img.to_rgb8();
-
     let img = img.into_raw();
     let img = Tensor::from_vec(img, (height, width, 3), &Device::Cpu)?
         .permute((2, 0, 1))?
         .to_dtype(DType::F32)?
         .affine(2. / 255., -1.)?;
-    // .unsqueeze(0)?;
     Ok(img)
 }

@@ -57,24 +53,16 @@ fn load_images<T: AsRef<std::path::Path>>(
     image_size: usize,
 ) -> anyhow::Result<Tensor> {
     let mut images = vec![];
-
     for path in paths {
         let tensor = load_image(path, image_size)?;
         images.push(tensor);
     }
-
     let images = Tensor::stack(&images, 0)?;
-
     Ok(images)
 }
 
 pub fn main() -> anyhow::Result<()> {
-    // std::env::set_var("RUST_BACKTRACE", "full");
-
     let args = Args::parse();
-
-    tracing_subscriber::fmt::init();
-
     let model_file = match args.model {
         None => {
             let api = hf_hub::api::sync::Api::new()?;
@@ -89,57 +77,39 @@ pub fn main() -> anyhow::Result<()> {
         }
         Some(model) => model.into(),
     };
-
     let tokenizer = get_tokenizer(args.tokenizer)?;
-
     let config = clip::ClipConfig::vit_base_patch32();
-
     let device = candle_examples::device(args.cpu)?;
-
     let vec_imgs = match args.images {
         Some(imgs) => imgs,
         None => vec![
             "candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg".to_string(),
             "candle-examples/examples/yolo-v8/assets/bike.jpg".to_string(),
         ],
     };
-
-    // let image = load_image(args.image, config.image_size)?.to_device(&device)?;
     let images = load_images(&vec_imgs, config.image_size)?.to_device(&device)?;
-
     let vb =
         unsafe { VarBuilder::from_mmaped_safetensors(&[model_file.clone()], DType::F32, &device)? };
-
     let model = clip::ClipModel::new(vb, &config)?;
-
     let (input_ids, vec_seq) = tokenize_sequences(args.sequences, &tokenizer, &device)?;
-
     let (_logits_per_text, logits_per_image) = model.forward(&images, &input_ids)?;
-
     let softmax_image = softmax(&logits_per_image, 1)?;
-
     let softmax_image_vec = softmax_image.flatten_all()?.to_vec1::<f32>()?;
-
-    info!("softmax_image_vec: {:?}", softmax_image_vec);
-
+    println!("softmax_image_vec: {:?}", softmax_image_vec);
     let probability_vec = softmax_image_vec
         .iter()
         .map(|v| v * 100.0)
         .collect::<Vec<f32>>();
-
     let probability_per_image = probability_vec.len() / vec_imgs.len();
-
     for (i, img) in vec_imgs.iter().enumerate() {
         let start = i * probability_per_image;
         let end = start + probability_per_image;
         let prob = &probability_vec[start..end];
-        info!("\n\nResults for image: {}\n", img);
-
+        println!("\n\nResults for image: {}\n", img);
         for (i, p) in prob.iter().enumerate() {
-            info!("Probability: {:.4}% Text: {} ", p, vec_seq[i]);
+            println!("Probability: {:.4}% Text: {} ", p, vec_seq[i]);
         }
     }
-
     Ok(())
 }

@@ -156,7 +126,6 @@ pub fn get_tokenizer(tokenizer: Option<String>) -> anyhow::Result<Tokenizer> {
         }
         Some(file) => file.into(),
     };
-
     Tokenizer::from_file(tokenizer).map_err(E::msg)
 }

@@ -169,7 +138,6 @@ pub fn tokenize_sequences(
         .get_vocab(true)
         .get("<|endoftext|>")
         .ok_or(E::msg("No pad token"))?;
-
     let vec_seq = match sequences {
         Some(seq) => seq,
         None => vec![
@@ -178,25 +146,19 @@
             "a robot holding a candle".to_string(),
         ],
     };
-
     let mut tokens = vec![];
-
     for seq in vec_seq.clone() {
         let encoding = tokenizer.encode(seq, true).map_err(E::msg)?;
         tokens.push(encoding.get_ids().to_vec());
     }
-
     let max_len = tokens.iter().map(|v| v.len()).max().unwrap_or(0);
-
     // Pad the sequences to have the same length
     for token_vec in tokens.iter_mut() {
         let len_diff = max_len - token_vec.len();
         if len_diff > 0 {
             token_vec.extend(vec![pad_id; len_diff]);
         }
     }
-
     let input_ids = Tensor::new(tokens, device)?;
-
     Ok((input_ids, vec_seq))
 }
24 changes: 24 additions & 0 deletions candle-examples/examples/siglip/README.md
@@ -0,0 +1,24 @@
## SigLIP

SigLIP is a multi-modal text-vision model that improves over CLIP by using a sigmoid-based loss; see the
[HuggingFace model card](https://huggingface.co/google/siglip-base-patch16-224). A sketch of the loss follows the example output below.

### Running an example
```
$ cargo run --features cuda -r --example siglip -
softmax_image_vec: [2.1912122e-14, 2.3624872e-14, 1.0, 1.0, 2.4787932e-8, 3.2784535e-12]


Results for image: candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg

Probability: 0.0000% Text: a cycling race
Probability: 0.0000% Text: a photo of two cats
Probability: 100.0000% Text: a robot holding a candle


Results for image: candle-examples/examples/yolo-v8/assets/bike.jpg

Probability: 100.0000% Text: a cycling race
Probability: 0.0000% Text: a photo of two cats
Probability: 0.0000% Text: a robot holding a candle
```
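
For reference, the sigmoid loss that gives SigLIP its name scores every image/text pair independently instead of normalizing over the whole batch the way CLIP's softmax does. Below is a minimal sketch of that loss over a plain logits matrix; it is illustrative only and not part of this commit, which covers inference rather than training.

```rust
// Illustrative sketch of the SigLIP pairwise sigmoid loss.
// `logits[i][j]` is the scaled similarity t * (x_i . y_j) + b for
// image i and text j; matching pairs sit on the diagonal.
fn sigmoid_loss(logits: &[Vec<f32>]) -> f32 {
    let n = logits.len() as f32;
    let mut loss = 0.0f32;
    for (i, row) in logits.iter().enumerate() {
        for (j, &z) in row.iter().enumerate() {
            // +1 for the matching pair, -1 for every other pair.
            let label = if i == j { 1.0f32 } else { -1.0 };
            // -log sigmoid(label * z) == ln(1 + exp(-label * z)),
            // computed via ln_1p for numerical stability.
            loss += (-label * z).exp().ln_1p();
        }
    }
    loss / n
}
```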
153 changes: 153 additions & 0 deletions candle-examples/examples/siglip/main.rs
@@ -0,0 +1,153 @@
#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

#[cfg(feature = "accelerate")]
extern crate accelerate_src;

use anyhow::Error as E;
use clap::Parser;

use candle::{DType, Device, Tensor};
use candle_nn::{ops::softmax, VarBuilder};
use candle_transformers::models::siglip;

use tokenizers::Tokenizer;

#[derive(Parser)]
struct Args {
    #[arg(long)]
    model: Option<String>,

    #[arg(long)]
    tokenizer: Option<String>,

    #[arg(long, use_value_delimiter = true)]
    images: Option<Vec<String>>,

    #[arg(long)]
    cpu: bool,

    #[arg(long, use_value_delimiter = true)]
    sequences: Option<Vec<String>>,
}

fn load_image<T: AsRef<std::path::Path>>(path: T, image_size: usize) -> anyhow::Result<Tensor> {
    let img = image::ImageReader::open(path)?.decode()?;
    let (height, width) = (image_size, image_size);
    let img = img.resize_to_fill(
        width as u32,
        height as u32,
        image::imageops::FilterType::Triangle,
    );
    let img = img.to_rgb8();
    let img = img.into_raw();
    let img = Tensor::from_vec(img, (height, width, 3), &Device::Cpu)?
        .permute((2, 0, 1))?
        .to_dtype(DType::F32)?
        .affine(2. / 255., -1.)?;
    Ok(img)
}

fn load_images<T: AsRef<std::path::Path>>(
    paths: &Vec<T>,
    image_size: usize,
) -> anyhow::Result<Tensor> {
    let mut images = vec![];
    for path in paths {
        let tensor = load_image(path, image_size)?;
        images.push(tensor);
    }
    let images = Tensor::stack(&images, 0)?;
    Ok(images)
}

pub fn main() -> anyhow::Result<()> {
    let args = Args::parse();
    let model_file = match args.model {
        None => {
            let api = hf_hub::api::sync::Api::new()?;
            let api = api.model("google/siglip-base-patch16-224".to_string());
            api.get("model.safetensors")?
        }
        Some(model) => model.into(),
    };
    let tokenizer = get_tokenizer(args.tokenizer)?;
    let config = siglip::Config::base_patch16_224();
    let device = candle_examples::device(args.cpu)?;
    let vec_imgs = match args.images {
        Some(imgs) => imgs,
        None => vec![
            "candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg".to_string(),
            "candle-examples/examples/yolo-v8/assets/bike.jpg".to_string(),
        ],
    };
    let images = load_images(&vec_imgs, config.vision_config.image_size)?.to_device(&device)?;
    let vb =
        unsafe { VarBuilder::from_mmaped_safetensors(&[model_file.clone()], DType::F32, &device)? };
    let model = siglip::Model::new(&config, vb)?;
    let (input_ids, vec_seq) = tokenize_sequences(&config, args.sequences, &tokenizer, &device)?;
    let (_logits_per_text, logits_per_image) = model.forward(&images, &input_ids)?;
    let softmax_image = softmax(&logits_per_image, 1)?;
    let softmax_image_vec = softmax_image.flatten_all()?.to_vec1::<f32>()?;
    println!("softmax_image_vec: {:?}", softmax_image_vec);
    let probability_vec = softmax_image_vec
        .iter()
        .map(|v| v * 100.0)
        .collect::<Vec<f32>>();
    let probability_per_image = probability_vec.len() / vec_imgs.len();
    for (i, img) in vec_imgs.iter().enumerate() {
        let start = i * probability_per_image;
        let end = start + probability_per_image;
        let prob = &probability_vec[start..end];
        println!("\n\nResults for image: {}\n", img);
        for (i, p) in prob.iter().enumerate() {
            println!("Probability: {:.4}% Text: {} ", p, vec_seq[i]);
        }
    }
    Ok(())
}

pub fn get_tokenizer(tokenizer: Option<String>) -> anyhow::Result<Tokenizer> {
    let tokenizer = match tokenizer {
        None => {
            let api = hf_hub::api::sync::Api::new()?;
            let api = api.model("google/siglip-base-patch16-224".to_string());
            api.get("tokenizer.json")?
        }
        Some(file) => file.into(),
    };

    Tokenizer::from_file(tokenizer).map_err(E::msg)
}

pub fn tokenize_sequences(
    config: &siglip::Config,
    sequences: Option<Vec<String>>,
    tokenizer: &Tokenizer,
    device: &Device,
) -> anyhow::Result<(Tensor, Vec<String>)> {
    let pad_id = config.text_config.pad_token_id;
    let vec_seq = match sequences {
        Some(seq) => seq,
        None => vec![
            "a cycling race".to_string(),
            "a photo of two cats".to_string(),
            "a robot holding a candle".to_string(),
        ],
    };
    let mut tokens = vec![];
    for seq in vec_seq.clone() {
        let encoding = tokenizer.encode(seq, true).map_err(E::msg)?;
        tokens.push(encoding.get_ids().to_vec());
    }
    let max_len = config.text_config.max_position_embeddings;
    // Pad the sequences to have the same length
    for token_vec in tokens.iter_mut() {
        let len_diff = max_len - token_vec.len();
        if len_diff > 0 {
            token_vec.extend(vec![pad_id; len_diff]);
        }
    }
    let input_ids = Tensor::new(tokens, device)?;
    Ok((input_ids, vec_seq))
}
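
A detail worth flagging in `tokenize_sequences` above: unlike the CLIP example, which pads to the longest sequence in the batch, the SigLIP example pads every sequence with `pad_token_id` up to the fixed `max_position_embeddings` of the text config. Note that `max_len - token_vec.len()` underflows for a sequence longer than the model length, so a caller should truncate first. A standalone sketch of a safer padding step, assuming plain `u32` token ids (illustrative, not part of the commit):

```rust
// Pad each tokenized sequence to the model's fixed text length,
// truncating first so the length difference can never underflow.
fn pad_to_fixed_len(tokens: &mut [Vec<u32>], max_len: usize, pad_id: u32) {
    for t in tokens.iter_mut() {
        t.truncate(max_len);
        t.resize(max_len, pad_id); // no-op when t.len() == max_len
    }
}
```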
5 changes: 0 additions & 5 deletions candle-transformers/src/models/clip/mod.rs
@@ -92,28 +92,23 @@ impl ClipConfig {
 impl ClipModel {
     pub fn new(vs: candle_nn::VarBuilder, c: &ClipConfig) -> Result<Self> {
         let text_model = ClipTextTransformer::new(vs.pp("text_model"), &c.text_config)?;
-
         let vision_model = ClipVisionTransformer::new(vs.pp("vision_model"), &c.vision_config)?;
-
         let visual_projection = candle_nn::linear_no_bias(
             c.vision_config.embed_dim,
             c.vision_config.projection_dim,
             vs.pp("visual_projection"),
         )?;
-
         let text_projection = candle_nn::linear_no_bias(
             c.text_config.embed_dim,
             c.text_config.projection_dim,
             vs.pp("text_projection"),
         )?;
-
         // originally nn.Parameter
         let logit_scale = if vs.contains_tensor("logit_scale") {
             vs.get(&[], "logit_scale")?
         } else {
             Tensor::new(&[c.logit_scale_init_value], vs.device())?
         };
-
         Ok(Self {
             text_model,
             vision_model,
4 changes: 2 additions & 2 deletions candle-transformers/src/models/clip/text_model.rs
@@ -77,7 +77,7 @@ impl ClipTextEmbeddings {
         )?;
         let position_ids =
             Tensor::arange(0u32, c.max_position_embeddings as u32, vs.device())?.unsqueeze(0)?;
-        Ok(ClipTextEmbeddings {
+        Ok(Self {
             token_embedding,
             position_embedding,
             position_ids,
@@ -298,7 +298,7 @@ impl ClipTextTransformer {
         })
     }
 
-    // TODO: rewrrite to newer version
+    // TODO: rewrite to newer version
     fn build_causal_attention_mask(
         bsz: usize,
         seq_len: usize,
12 changes: 6 additions & 6 deletions candle-transformers/src/models/fastvit.rs
@@ -11,13 +11,13 @@ use candle_nn::{
     BatchNorm, Conv2d, Conv2dConfig, Func, VarBuilder,
 };
 
-#[derive(Clone, Debug)]
+#[derive(serde::Serialize, serde::Deserialize, Clone, Debug)]
 pub struct Config {
-    exp_ratio: usize,
-    in_channels: usize,
-    blocks: [usize; 4],
-    attn: bool,
-    lkc_use_act: bool,
+    pub exp_ratio: usize,
+    pub in_channels: usize,
+    pub blocks: [usize; 4],
+    pub attn: bool,
+    pub lkc_use_act: bool,
 }
 
 impl Config {
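
The fastvit change above derives `serde::Serialize` and `serde::Deserialize` and makes the `Config` fields public, which lets a configuration be loaded from JSON rather than only through the built-in constructors. A hypothetical sketch; the field values below are invented for illustration and do not correspond to a named fastvit variant:

```rust
use candle_transformers::models::fastvit;

// Illustrative only: build a fastvit::Config from a JSON string.
fn config_from_json(json: &str) -> anyhow::Result<fastvit::Config> {
    Ok(serde_json::from_str(json)?)
}

// Example call with made-up values:
// let cfg = config_from_json(
//     r#"{"exp_ratio":4,"in_channels":64,"blocks":[2,2,6,2],"attn":true,"lkc_use_act":true}"#,
// )?;
```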