tch 0.14.0 update #435

Merged (7 commits, Nov 26, 2023)
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,12 +5,17 @@ All notable changes to this project will be documented in this file. The format
## Added
- Addition of `new_with_tokenizer` constructor for `SentenceEmbeddingsModel` allowing passing custom tokenizers for sentence embeddings pipelines.
- Support for [Tokenizers](https://github.com/huggingface/tokenizers) in pipelines, allowing loading `tokenizer.json` and `special_token_map.json` tokenizer files.
- (BREAKING) Most model configurations can now take an optional `kind` parameter to specify the model weight precision. If not provided, the precision defaults to full precision on CPU, or to the precision of the serialized weights otherwise.
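
As a quick illustration of the new `kind` option, here is a minimal sketch. The `kind` and `device` fields come from the `TextGenerationConfig` change further down in this diff; everything else is assumed for illustration:

```rust
use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
use tch::{Device, Kind};

fn build_half_precision_model() -> anyhow::Result<TextGenerationModel> {
    let config = TextGenerationConfig {
        // Cast the loaded weights to fp16; `None` keeps the default behaviour
        // described above (full precision on CPU, serialized precision otherwise).
        kind: Some(Kind::Half),
        device: Device::cuda_if_available(),
        ..Default::default()
    };
    Ok(TextGenerationModel::new(config)?)
}
```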

## Fixed
- (BREAKING) Fixed the keyword extraction pipeline for n-gram sizes > 2. Added a new configuration option `tokenizer_forbidden_ngram_chars` to specify characters that should be excluded from n-grams (allows filtering n-grams spanning multiple sentences).
- Improved MPS device compatibility by setting the `sparse_grad` flag to false for `gather` operations
- Updated ONNX runtime backend version to 1.15.x
- Fixed an issue causing incorrect results for QA models whose tokenizer does not use segment ids
- Fixed an issue with GPT-J incorrectly tracking gradients for the attention bias
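
For context on the MPS fix, `sparse_grad` is the third argument of `Tensor::gather` in `tch`; a minimal sketch (the tensors here are invented for illustration):

```rust
use tch::Tensor;

fn main() {
    let values = Tensor::from_slice(&[10i64, 20, 30, 40]).view((2, 2));
    let index = Tensor::from_slice(&[0i64, 0, 1, 0]).view((2, 2));
    // Passing `sparse_grad = false` avoids the sparse-gradient path that is
    // problematic on MPS devices, which is what the fix above relies on.
    let gathered = values.gather(1, &index, false);
    gathered.print();
}
```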

## Changed
- (BREAKING) Upgraded to `torch` 2.1 (via `tch` 0.14.0).

## [0.21.0] - 2023-06-03
## Added
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -76,7 +76,7 @@ features = ["doc-only"]

[dependencies]
rust_tokenizers = "8.1.1"
tch = "0.13.0"
tch = "0.14.0"
serde_json = "1"
serde = { version = "1", features = ["derive"] }
ordered-float = "3"
@@ -97,7 +97,7 @@ anyhow = "1"
csv = "1"
criterion = "0.4"
tokio = { version = "1.24", features = ["sync", "rt-multi-thread", "macros"] }
torch-sys = "0.13.0"
torch-sys = "0.14.0"
tempfile = "3"
itertools = "0.10"
tracing-subscriber = { version = "0.3", default-features = false, features = [ "env-filter", "fmt" ] }
4 changes: 2 additions & 2 deletions README.md
@@ -80,8 +80,8 @@ This cache location defaults to `~/.cache/.rustbert`, but can be changed by sett

### Manual installation (recommended)

1. Download `libtorch` from https://pytorch.org/get-started/locally/. This package requires `v2.0.0`: if this version is no longer available on the "get started" page,
the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcu118.zip` for a Linux version with CUDA11. **NOTE:** When using `rust-bert` as dependency from [crates.io](https://crates.io), please check the required `LIBTORCH` on the published package [readme](https://crates.io/crates/rust-bert) as it may differ from the version documented here (applying to the current repository version).
1. Download `libtorch` from https://pytorch.org/get-started/locally/. This package requires `v2.1`: if this version is no longer available on the "get started" page,
the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.1%2Bcu118.zip` for a Linux version with CUDA 11. **NOTE:** When using `rust-bert` as a dependency from [crates.io](https://crates.io), please check the required `LIBTORCH` on the published package [readme](https://crates.io/crates/rust-bert), as it may differ from the version documented here (applying to the current repository version).
2. Extract the library to a location of your choice
3. Set the following environment variables
##### Linux:
1 change: 1 addition & 0 deletions benches/generation_benchmark.rs
@@ -37,6 +37,7 @@ fn create_text_generation_model() -> TextGenerationModel {
diversity_penalty: None,
num_return_sequences: 5,
device: Device::cuda_if_available(),
kind: None,
};
TextGenerationModel::new(config).unwrap()
}
2 changes: 1 addition & 1 deletion examples/natural_language_inference_deberta.rs
@@ -38,7 +38,7 @@ fn main() -> anyhow::Result<()> {
)?;
let config = DebertaConfig::from_file(config_path);
let model = DebertaForSequenceClassification::new(vs.root(), &config)?;
load_weights(&model_resource, &mut vs)?;
load_weights(&model_resource, &mut vs, None, device)?;

// Define input
let input = [("I love you.", "I like you.")];
15 changes: 9 additions & 6 deletions src/common/resources/mod.rs
@@ -30,6 +30,7 @@ use std::ops::DerefMut;
use std::path::PathBuf;
use std::sync::RwLockWriteGuard;
use tch::nn::VarStore;
use tch::{Device, Kind};

pub enum Resource<'a> {
PathBuf(PathBuf),
@@ -84,17 +85,19 @@ impl<T: ResourceProvider + ?Sized> ResourceProvider for Box<T> {
pub fn load_weights(
rp: &(impl ResourceProvider + ?Sized),
vs: &mut VarStore,
kind: Option<Kind>,
device: Device,
) -> Result<(), RustBertError> {
match rp.get_resource()? {
Resource::Buffer(mut data) => {
vs.load_from_stream(std::io::Cursor::new(data.deref_mut()))?;
Ok(())
}
Resource::PathBuf(path) => Ok(vs.load(path)?),
}
Resource::Buffer(mut data) => vs.load_from_stream(std::io::Cursor::new(data.deref_mut())),
Resource::PathBuf(path) => vs.load(path),
}?;
cast_var_store(vs, kind, device);
Ok(())
}

#[cfg(feature = "remote")]
mod remote;
use crate::pipelines::common::cast_var_store;
#[cfg(feature = "remote")]
pub use remote::RemoteResource;
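
The updated `load_weights` signature now takes the optional target `Kind` and the `Device`, so the variable store can be cast after loading (via `cast_var_store` above). A hedged sketch of a call site, mirroring the DeBERTa example earlier in this diff; the resource path and the casting choice are placeholders:

```rust
use rust_bert::resources::{load_weights, LocalResource};
use std::path::PathBuf;
use tch::{nn, Device, Kind};

fn load_half_precision(weights_path: PathBuf) -> anyhow::Result<nn::VarStore> {
    let device = Device::cuda_if_available();
    let mut vs = nn::VarStore::new(device);
    // ... construct the model against `vs.root()` from its config here ...
    let weights = LocalResource { local_path: weights_path };
    // New signature: resource, var store, optional precision, target device.
    load_weights(&weights, &mut vs, Some(Kind::Half), device)?;
    Ok(vs)
}
```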
4 changes: 2 additions & 2 deletions src/lib.rs
@@ -90,8 +90,8 @@
//!
//! ### Manual installation (recommended)
//!
//! 1. Download `libtorch` from <https://pytorch.org/get-started/locally/>. This package requires `v2.0`: if this version is no longer available on the "get started" page,
//! the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcu118.zip` for a Linux version with CUDA11.
//! 1. Download `libtorch` from <https://pytorch.org/get-started/locally/>. This package requires `v2.1`: if this version is no longer available on the "get started" page,
//! the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.1%2Bcu118.zip` for a Linux version with CUDA 11.
//! 2. Extract the library to a location of your choice
//! 3. Set the following environment variables
//! ##### Linux:
7 changes: 6 additions & 1 deletion src/models/bart/bart_model.rs
@@ -1004,7 +1004,12 @@ impl BartGenerator {
let mut var_store = nn::VarStore::new(device);
let config = BartConfig::from_file(config_path);
let model = BartForConditionalGeneration::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/gpt2/gpt2_model.rs
@@ -652,7 +652,12 @@ impl GPT2Generator {

let config = Gpt2Config::from_file(config_path);
let model = GPT2LMHeadModel::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = tokenizer.get_bos_id();
let eos_token_ids = tokenizer.get_eos_id().map(|id| vec![id]);
21 changes: 7 additions & 14 deletions src/models/gpt_j/attention.rs
@@ -68,11 +68,16 @@ impl GptJAttention {
let p = p.borrow();

let max_positions = config.n_positions;
let bias = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device()))
let bias_value = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device()))
.tril(0)
.view([1, 1, max_positions, max_positions])
.requires_grad_(false);
let bias = p.var_copy("bias", &bias);
let mut bias = p
.f_ones_no_train("bias", &[1, 1, max_positions, max_positions])
.unwrap()
.to_kind(Kind::Uint8)
.to_device(p.device());
bias.copy_(&bias_value);

let attn_pdrop = config.attn_pdrop.unwrap_or(0.1);
let resid_pdrop = config.resid_pdrop.unwrap_or(0.1);
@@ -95,21 +100,9 @@
..Default::default()
};
let k_proj = nn::linear(p / "k_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "k_proj").half();
}
let v_proj = nn::linear(p / "v_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "v_proj").half();
}
let q_proj = nn::linear(p / "q_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "q_proj").half();
}
let out_proj = nn::linear(p / "out_proj", config.n_embd, config.n_embd, linear_config);
if config.use_float16 {
(p / "out_proj").half();
}

GptJAttention {
bias,
23 changes: 6 additions & 17 deletions src/models/gpt_j/gpt_j_model.rs
@@ -131,8 +131,6 @@ pub struct GptJConfig {
pub rotary_dim: Option<i64>,
pub vocab_size: i64,
pub scale_attn_weights: Option<bool>,
#[serde(default = "default_use_float16")]
pub use_float16: bool,
#[serde(default = "default_preload_on_cpu")]
pub preload_on_cpu: bool,
pub decoder_start_token_id: Option<i64>,
@@ -164,7 +162,6 @@ impl Default for GptJConfig {
rotary_dim: Some(64),
vocab_size: 50400,
scale_attn_weights: Some(true),
use_float16: default_use_float16(),
preload_on_cpu: default_preload_on_cpu(),
decoder_start_token_id: None,
forced_bos_token_id: None,
@@ -173,10 +170,6 @@
}
}

fn default_use_float16() -> bool {
true
}

fn default_preload_on_cpu() -> bool {
true
}
@@ -233,9 +226,6 @@ impl GptJModel {
config.n_embd,
Default::default(),
);
if config.use_float16 {
(&(&p / "wte") / "weight").half()
};

let embd_pdrop = config.embd_pdrop.unwrap_or(0.1);
let drop = Dropout::new(embd_pdrop);
@@ -245,9 +235,6 @@
..Default::default()
};
let ln_f = nn::layer_norm(&p / "ln_f", vec![config.n_embd], layer_norm_config);
if config.use_float16 {
(&p / "ln_f").half()
};

let mut h: Vec<GptJBlock> = vec![];
let h_path = &p / "h";
@@ -475,9 +462,6 @@ impl GptJLMHeadModel {
config.vocab_size,
Default::default(),
);
if config.use_float16 {
(p / "lm_head").half();
}

GptJLMHeadModel {
transformer,
@@ -625,7 +609,12 @@ impl GptJGenerator {
if config.preload_on_cpu && device != Device::Cpu {
var_store.set_device(Device::Cpu);
}
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;
if device != Device::Cpu {
var_store.set_device(device);
}
9 changes: 0 additions & 9 deletions src/models/gpt_j/transformer.rs
@@ -43,18 +43,12 @@ impl GptJMLP {
intermediate_size,
Default::default(),
);
if config.use_float16 {
(p / "fc_in").half()
};
let fc_out = nn::linear(
p / "fc_out",
intermediate_size,
config.n_embd,
Default::default(),
);
if config.use_float16 {
(p / "fc_out").half()
};

let activation = match &config.afn {
Some(activation_enum) => match activation_enum {
@@ -100,9 +94,6 @@ impl GptJBlock {
..Default::default()
};
let ln_1 = nn::layer_norm(p / "ln_1", vec![config.n_embd], layer_norm_config);
if config.use_float16 {
(p / "ln_1").half()
};
let attn = GptJAttention::new(p / "attn", config);
let mlp = GptJMLP::new(p / "mlp", config);

7 changes: 6 additions & 1 deletion src/models/gpt_neo/gpt_neo_model.rs
@@ -672,7 +672,12 @@ impl GptNeoGenerator {
let mut var_store = nn::VarStore::new(device);
let config = GptNeoConfig::from_file(config_path);
let model = GptNeoForCausalLM::new(var_store.root(), &config)?;
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = tokenizer.get_bos_id();
let eos_token_ids = tokenizer.get_eos_id().map(|id| vec![id]);
4 changes: 2 additions & 2 deletions src/models/longt5/encoder.rs
@@ -288,8 +288,8 @@ impl LongT5Stack {

let (batch_size, sequence_length) = (input_shape[0], input_shape[1]);

let mask_seq_length = if old_layer_states.is_some() {
if old_layer_states.as_ref().unwrap()[0].0.is_some() {
let mask_seq_length = if let Some(old_layer_states_value) = &old_layer_states {
if old_layer_states_value[0].0.is_some() {
old_layer_states.as_ref().unwrap()[0]
.0
.as_ref()
7 changes: 6 additions & 1 deletion src/models/longt5/longt5_model.rs
@@ -595,7 +595,12 @@ impl LongT5Generator {

let config = LongT5Config::from_file(config_path);
let model = LongT5ForConditionalGeneration::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = config.bos_token_id;
let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/m2m_100/m2m_100_model.rs
@@ -544,7 +544,12 @@ impl M2M100Generator {

let config = M2M100Config::from_file(config_path);
let model = M2M100ForConditionalGeneration::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/marian/marian_model.rs
@@ -761,7 +761,12 @@ impl MarianGenerator {

let config = BartConfig::from_file(config_path);
let model = MarianForConditionalGeneration::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
let eos_token_ids = Some(match config.eos_token_id {
9 changes: 7 additions & 2 deletions src/models/mbart/mbart_model.rs
@@ -650,7 +650,7 @@ impl MBartForSequenceClassification {
/// # let device = Device::Cpu;
/// # let vs = nn::VarStore::new(device);
/// # let config = MBartConfig::from_file(config_path);
/// # let mbart_model: MBartForSequenceClassification = MBartForSequenceClassification::new(&vs.root(), &config).unwrap();;
/// # let mbart_model: MBartForSequenceClassification = MBartForSequenceClassification::new(&vs.root(), &config).unwrap();
/// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
/// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
/// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
@@ -800,7 +800,12 @@ impl MBartGenerator {

let config = MBartConfig::from_file(config_path);
let model = MBartForConditionalGeneration::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/openai_gpt/openai_gpt_model.rs
@@ -498,7 +498,12 @@ impl OpenAIGenerator {
let mut var_store = nn::VarStore::new(device);
let config = Gpt2Config::from_file(config_path);
let model = OpenAIGPTLMHeadModel::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = tokenizer.get_bos_id();
let eos_token_ids = tokenizer.get_eos_id().map(|id| vec![id]);
7 changes: 6 additions & 1 deletion src/models/pegasus/pegasus_model.rs
@@ -505,7 +505,12 @@ impl PegasusConditionalGenerator {
let mut var_store = nn::VarStore::new(device);
let config = PegasusConfig::from_file(config_path);
let model = PegasusForConditionalGeneration::new(var_store.root(), &config);
crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
crate::resources::load_weights(
&generate_config.model_resource,
&mut var_store,
generate_config.kind,
device,
)?;

let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
let eos_token_ids = config