From 492a17871ef1c904ea4f4d75e37aeeca783ed22c Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 20 Feb 2025 18:01:11 +0100 Subject: [PATCH 01/28] feat(huggingface): Added first version of a true conversion of the modalities gpt2 model into a llama style huggingface model. --- src/modalities/conversion/__init__.py | 0 src/modalities/conversion/gpt2/__init__.py | 0 .../conversion/gpt2/configuration_gpt2.py | 226 +++ .../conversion/gpt2/convert_gpt2.py | 128 ++ .../conversion/gpt2/modeling_gpt2.py | 1468 +++++++++++++++++ tests/conversion/__init__.py | 0 tests/conversion/gpt2/__init__.py | 0 tests/conversion/gpt2/test_convert_gpt2.py | 26 + 8 files changed, 1848 insertions(+) create mode 100644 src/modalities/conversion/__init__.py create mode 100644 src/modalities/conversion/gpt2/__init__.py create mode 100755 src/modalities/conversion/gpt2/configuration_gpt2.py create mode 100644 src/modalities/conversion/gpt2/convert_gpt2.py create mode 100644 src/modalities/conversion/gpt2/modeling_gpt2.py create mode 100644 tests/conversion/__init__.py create mode 100644 tests/conversion/gpt2/__init__.py create mode 100644 tests/conversion/gpt2/test_convert_gpt2.py diff --git a/src/modalities/conversion/__init__.py b/src/modalities/conversion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/modalities/conversion/gpt2/__init__.py b/src/modalities/conversion/gpt2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/modalities/conversion/gpt2/configuration_gpt2.py b/src/modalities/conversion/gpt2/configuration_gpt2.py new file mode 100755 index 000000000..9ead00b3e --- /dev/null +++ b/src/modalities/conversion/gpt2/configuration_gpt2.py @@ -0,0 +1,226 @@ +# coding=utf-8 +# This code was copied and modified from the Llama implementation of the Hugging Face Transformers library. +# The original code can be found at: +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/configuration_llama.py +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""LLaMA-like GPT2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_rope_utils import rope_config_validation + + +class GPT2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GPT2Model`]. It is used to instantiate an GPT2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaMA-7B. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the GPT2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPT2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. 
Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to the value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2.
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to the low frequency components of the RoPE.
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to the high frequency components of the RoPE.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the up_proj, down_proj and gate_proj layers of the MLP.
+        head_dim (`int`, *optional*):
+            The attention head dimension. 
If None, it will default to hidden_size // num_heads + + ```python + >>> from transformers import GPT2Model, GPT2Config + + >>> # Initializing a GPT2 with a llama-7b style configuration + >>> configuration = GPT2Config() + + >>> # Initializing a model from the llama-7b style configuration + >>> model = GPT2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llama" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `GPT2Model` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=None, + layer_norm_eps: float = 1e-06, + layer_norm_bias: bool = True, + layer_norm_elementwise_affine: bool = True, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + head_dim=None, + **kwargs, + ): + if rms_norm_eps is not None: + raise ValueError("RMSNorm is not supported in GPT2 model.") + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.layer_norm_eps = layer_norm_eps + self.layer_norm_bias = layer_norm_bias + self.layer_norm_elementwise_affine = layer_norm_elementwise_affine + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. 
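+        # Example (illustrative values, not from the original code): rope_scaling={"type": "linear", "factor": 2.0}
+        # additionally gains "rope_type": "linear" here, so rope_config_validation and downstream code can rely on
+        # the "rope_type" key being present.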
+ if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py new file mode 100644 index 000000000..41beed53f --- /dev/null +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -0,0 +1,128 @@ +import argparse +from typing import Tuple + +import torch +import torch.nn as nn +from tqdm import tqdm + +from modalities.config.config import load_app_config_dict +from modalities.conversion.gpt2.configuration_gpt2 import GPT2Config +from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM +from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block, PositionTypes +from modalities.models.model import SwiGLU +from modalities.models.utils import ModelTypeEnum, get_model_from_config + + +def convert_model_checkpoint(modalities_config: dict) -> Tuple[GPT2ForCausalLM, GPT2LLM]: + gpt2_config = convert_model_config(modalities_config) + hf_model = GPT2ForCausalLM(gpt2_config).to(dtype=torch.bfloat16) + modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + _copy_weights_model(hf_model, modalities_model) + return hf_model, modalities_model + + +def convert_model_config(modalities_config: dict) -> GPT2Config: + assert modalities_config["model_raw"]["config"]["poe_type"] == PositionTypes.NOPE + assert modalities_config["model_raw"]["config"]["activation_type"] == "swiglu" + + return GPT2Config( + vocab_size=modalities_config["model_raw"]["config"]["vocab_size"], + hidden_size=modalities_config["model_raw"]["config"]["n_embd"], + pad_token_id=None, + num_hidden_layers=modalities_config["model_raw"]["config"]["n_layer"], + num_key_value_heads=modalities_config["model_raw"]["config"]["n_head_kv"], + num_attention_heads=modalities_config["model_raw"]["config"]["n_head_q"], + intermediate_size=SwiGLU._get_hidden_dim(ffn_hidden=modalities_config["model_raw"]["config"]["ffn_hidden"]), + mlp_bias=modalities_config["model_raw"]["config"]["bias"], + hidden_act="silu", + layer_norm_eps=modalities_config["model_raw"]["config"]["ffn_norm"]["config"]["eps"], + layer_norm_elementwise_affine=modalities_config["model_raw"]["config"]["ffn_norm"]["config"].get( + "elementwise_affine", True + ), + layer_norm_bias=modalities_config["model_raw"]["config"]["ffn_norm"]["config"].get("bias", True), + max_position_embeddings=modalities_config["model_raw"]["config"]["sequence_length"], + rope_theta=modalities_config["model_raw"]["config"]["attention_config"]["qkv_transforms"][0]["config"][ + "base_freq" + ], + _attn_implementation="sdpa", + output_attentions=False, + ) + + +def test_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, num_testruns: int, vocab_size: int): + for _ in tqdm(range(num_testruns), desc="Testing converted model"): + input_ids = torch.randint(0, vocab_size, (1, 1024), device=hf_model.device) + inputs = {modalities_model.sample_key: input_ids.to(modalities_model.transformer.wte.weight.device)} + + with torch.no_grad(): + llama_logits = hf_model(input_ids=input_ids).logits.to("cpu") + modalities_logits = modalities_model(inputs)[modalities_model.prediction_key].to("cpu") + + assert llama_logits.shape == modalities_logits.shape + assert 
torch.equal(llama_logits, modalities_logits) + + +def _copy_weights_model(hf_model_model: GPT2ForCausalLM, modalities_model: GPT2LLM): + hf_model_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) + for hf_layer, modalities_layer in zip(hf_model_model.model.layers, modalities_model.transformer.h): + _copy_weights_attention(hf_layer, modalities_layer) + _copy_weights_mlp(hf_layer, modalities_layer) + _copy_weights_layer_norms(hf_layer, modalities_layer) + _copy_weights_base_modules(hf_model_model.lm_head, modalities_model.lm_head) + _copy_weights_base_modules(hf_model_model.model.norm, modalities_model.transformer.lm_head_norm) + + +def _copy_weights_attention(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + _copy_weights_base_modules(hf_layer.self_attn.q_proj, modalities_layer.attn.q_attn) + _copy_weights_base_modules(hf_layer.self_attn.k_proj, modalities_layer.attn.k_attn) + _copy_weights_base_modules(hf_layer.self_attn.v_proj, modalities_layer.attn.v_attn) + _copy_weights_base_modules(hf_layer.self_attn.o_proj, modalities_layer.attn.c_proj) + + +def _copy_weights_mlp(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + _copy_weights_base_modules(hf_layer.mlp.down_proj, modalities_layer.mlp.W_2) + _copy_weights_base_modules(hf_layer.mlp.gate_proj, modalities_layer.mlp.W) + _copy_weights_base_modules(hf_layer.mlp.up_proj, modalities_layer.mlp.V) + + +def _copy_weights_layer_norms(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + _copy_weights_base_modules(hf_layer.input_layernorm, modalities_layer.attention_norm) + _copy_weights_base_modules(hf_layer.post_attention_layernorm, modalities_layer.ffn_norm) + + +def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn.LayerNorm): + assert m1.weight.shape == m2.weight.shape + assert (m1.bias is None and m2.bias is None) or m1.bias.shape == m2.bias.shape + m1.weight.data.copy_(m2.weight.data) + if m1.bias is not None: + m1.bias.data.copy_(m2.bias.data) + + +if __name__ == "__main__": + import os + + os.environ["LOCAL_RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + os.environ["RANK"] = "0" + + parser = argparse.ArgumentParser(description="Convert GPT-2 model checkpoint.") + parser.add_argument("modalities_config", type=str, help="Path to the modalities config file.") + parser.add_argument("output_dir", type=str, help="Directory to save the converted model.") + parser.add_argument("--num_testruns", type=int, default=0, help="Number of test runs to perform.") + parser.add_argument("--device_modalities", type=str, default="cpu", help="Device for the modalities model.") + parser.add_argument("--device_hf", type=str, default="cpu", help="Device for the Hugging Face model.") + + args = parser.parse_args() + + modalities_config = load_app_config_dict(args.modalities_config) + hf_model, modalities_model = convert_model_checkpoint(modalities_config) + + if args.num_testruns > 0: + test_converted_model( + hf_model.to(args.device_hf), + modalities_model.to(args.device_modalities), + args.num_testruns, + modalities_config["model_raw"]["config"]["vocab_size"], + ) + + hf_model.save_pretrained(args.output_dir) diff --git a/src/modalities/conversion/gpt2/modeling_gpt2.py b/src/modalities/conversion/gpt2/modeling_gpt2.py new file mode 100644 index 000000000..7d0ba09c0 --- /dev/null +++ b/src/modalities/conversion/gpt2/modeling_gpt2.py @@ -0,0 +1,1468 @@ +# coding=utf-8 +# This code was copied and modified from the Llama implementation of the Hugging Face Transformers library. 
+# The original code can be found at: +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# Original license information: +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from modalities.conversion.gpt2.configuration_gpt2 import GPT2Config +from torch import nn +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache, StaticCache +from transformers.generation import GenerationMixin +from transformers.modeling_attn_mask_utils import AttentionMaskConverter +from transformers.modeling_flash_attention_utils import ( + FlashAttentionKwargs, + _flash_attention_forward, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + QuestionAnsweringModelOutput, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS +from transformers.modeling_utils import PreTrainedModel +from transformers.processing_utils import Unpack +from transformers.utils import ( + LossKwargs, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b-hf" +_CONFIG_FOR_DOC = "GPT2Config" + + +class LlamaRotaryEmbedding(nn.Module): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[GPT2Config] = None, + ): + super().__init__() + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.46" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.46. Please use " + "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)." + ) + kwargs["rope_type"] = "linear" + super().__init__(*args, **kwargs) + + +class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.46. Please use " + "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to " + "__init__)." + ) + kwargs["rope_type"] = "dynamic" + super().__init__(*args, **kwargs) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: GPT2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads) + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + + # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = LlamaRotaryEmbedding(config=self.config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaFlashAttention2(LlamaAttention): + """ + Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
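+        # I.e. this flag is only True when an older flash_attn (< 2.1) is installed and the top-left aligned
+        # mask convention has to be used.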
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if isinstance(past_key_value, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
(LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class LlamaSdpaAttention(LlamaAttention): + """ + Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from LlamaAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
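+        # causal_mask is only None when _update_causal_mask decided no explicit mask is needed; for q_len == 1
+        # (single-token decoding) causal masking is a no-op, so is_causal also stays False in that case.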
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +LLAMA_ATTENTION_CLASSES = { + "eager": LlamaAttention, + "flash_attention_2": LlamaFlashAttention2, + "sdpa": LlamaSdpaAttention, +} + + +class GPT2DecoderLayer(nn.Module): + def __init__(self, config: GPT2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + + self.mlp = LlamaMLP(config) + self.input_layernorm = nn.LayerNorm( + config.hidden_size, + eps=config.layer_norm_eps, + elementwise_affine=config.layer_norm_elementwise_affine, + bias=config.layer_norm_bias, + ) + self.post_attention_layernorm = nn.LayerNorm( + config.hidden_size, + eps=config.layer_norm_eps, + elementwise_affine=config.layer_norm_elementwise_affine, + bias=config.layer_norm_bias, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
+ kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`LlamaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class GPT2PreTrainedModel(PreTrainedModel): + config_class = GPT2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["LlamaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class GPT2Model(GPT2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ + + def __init__(self, config: GPT2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [GPT2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = nn.LayerNorm( + config.hidden_size, + eps=config.layer_norm_eps, + elementwise_affine=config.layer_norm_elementwise_affine, + bias=config.layer_norm_bias, + ) + self.rotary_emb = LlamaRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **flash_attn_kwargs, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
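+        # The flash_attention_2 case returned early above; the remainder of this method decides whether SDPA can
+        # run without an explicit mask and otherwise builds the 4D additive causal mask.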
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to place the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... + + +class GPT2ForCausalLM(GPT2PreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + + def __init__(self, config): + super().__init__(config) + self.model = GPT2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + **kwargs: Unpack[KwargsForCausalLM], + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, GPT2ForCausalLM + + >>> model = GPT2ForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + The LLaMa-like GPT2 Model transformer with a sequence classification head on top (linear layer). + + [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + LLAMA_START_DOCSTRING, +) +class GPT2ForSequenceClassification(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = GPT2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ +The Llama-like Model transformer with a span classification head on top for extractive question-answering tasks like +SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + LLAMA_START_DOCSTRING, +) +class GPT2ForQuestionAnswering(GPT2PreTrainedModel): + base_model_prefix = "transformer" + + # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama + def __init__(self, config): + super().__init__(config) + self.transformer = GPT2Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.transformer.embed_tokens + + def set_input_embeddings(self, value): + self.transformer.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + loss = None + if start_positions is not None and end_positions is not None: + loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return QuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + The Llama-like GPT2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + LLAMA_START_DOCSTRING, +) +class GPT2ForTokenClassification(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = GPT2Model(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss = self.loss_function(logits, labels, self.config) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/tests/conversion/__init__.py b/tests/conversion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/conversion/gpt2/__init__.py b/tests/conversion/gpt2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/conversion/gpt2/test_convert_gpt2.py b/tests/conversion/gpt2/test_convert_gpt2.py new file mode 100644 index 000000000..2062663b1 --- /dev/null +++ b/tests/conversion/gpt2/test_convert_gpt2.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn +from modalities.conversion.gpt2.convert_gpt2 import _copy_weights_base_modules + + +def test_copying_base_modules_weights_yields_identical_modules(): + m1 = nn.Linear(10, 10, bias=True) + m2 = nn.Linear(10, 10, bias=True) + m1.weight.data = torch.randn(10, 10) + m1.bias.data = torch.randn(10) + + _copy_weights_base_modules(m1, m2) + + assert torch.equal(m1.weight.data, m2.weight.data) + assert torch.equal(m1.bias.data, m2.bias.data) + + +def test_copying_base_modules_works_when_bias_is_false(): + m1 = nn.Linear(10, 10, bias=False) + m2 = nn.Linear(10, 10, bias=False) + m1.weight.data = torch.randn(10, 10) + + _copy_weights_base_modules(m1, m2) + + assert torch.equal(m1.weight.data, m2.weight.data) + assert m1.bias == m2.bias == None From 84ac2b7b5a94b605de3a534d3cacd144d2ac3775 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Fri, 21 Feb 2025 10:47:42 +0000 Subject: [PATCH 02/28] refactor(conversion): convert_gpt2 --- .../conversion/gpt2/convert_gpt2.py | 77 +++++++++++-------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index 41beed53f..b41056c88 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -1,5 +1,4 @@ import argparse -from typing import Tuple import torch import torch.nn as nn @@ -13,7 +12,7 @@ from modalities.models.utils import ModelTypeEnum, get_model_from_config -def convert_model_checkpoint(modalities_config: dict) -> Tuple[GPT2ForCausalLM, GPT2LLM]: +def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, GPT2LLM]: gpt2_config = convert_model_config(modalities_config) hf_model = GPT2ForCausalLM(gpt2_config).to(dtype=torch.bfloat16) modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) @@ -22,28 +21,32 @@ def convert_model_checkpoint(modalities_config: dict) -> Tuple[GPT2ForCausalLM, def convert_model_config(modalities_config: dict) -> GPT2Config: - assert modalities_config["model_raw"]["config"]["poe_type"] == PositionTypes.NOPE - assert 
modalities_config["model_raw"]["config"]["activation_type"] == "swiglu" + config = modalities_config["model_raw"]["config"] + + assert config["poe_type"] == PositionTypes.NOPE + assert config["activation_type"] == "swiglu" return GPT2Config( - vocab_size=modalities_config["model_raw"]["config"]["vocab_size"], - hidden_size=modalities_config["model_raw"]["config"]["n_embd"], + vocab_size=config["vocab_size"], + hidden_size=config["n_embd"], pad_token_id=None, - num_hidden_layers=modalities_config["model_raw"]["config"]["n_layer"], - num_key_value_heads=modalities_config["model_raw"]["config"]["n_head_kv"], - num_attention_heads=modalities_config["model_raw"]["config"]["n_head_q"], - intermediate_size=SwiGLU._get_hidden_dim(ffn_hidden=modalities_config["model_raw"]["config"]["ffn_hidden"]), - mlp_bias=modalities_config["model_raw"]["config"]["bias"], + num_hidden_layers=config["n_layer"], + num_key_value_heads=config["n_head_kv"], + num_attention_heads=config["n_head_q"], + intermediate_size=SwiGLU._get_hidden_dim(ffn_hidden=config["ffn_hidden"]), + mlp_bias=config["bias"], hidden_act="silu", - layer_norm_eps=modalities_config["model_raw"]["config"]["ffn_norm"]["config"]["eps"], - layer_norm_elementwise_affine=modalities_config["model_raw"]["config"]["ffn_norm"]["config"].get( - "elementwise_affine", True + layer_norm_eps=config["ffn_norm"]["config"]["eps"], + layer_norm_elementwise_affine=config["ffn_norm"]["config"].get( + "elementwise_affine", + True, + # TODO: + # Temporary solution: double-check that these are the correct default values. + # Permanent solution: read default values from where they are defined. ), - layer_norm_bias=modalities_config["model_raw"]["config"]["ffn_norm"]["config"].get("bias", True), - max_position_embeddings=modalities_config["model_raw"]["config"]["sequence_length"], - rope_theta=modalities_config["model_raw"]["config"]["attention_config"]["qkv_transforms"][0]["config"][ - "base_freq" - ], + layer_norm_bias=config["ffn_norm"]["config"].get("bias", True), # TODO: see comment above + max_position_embeddings=config["sequence_length"], + rope_theta=config["attention_config"]["qkv_transforms"][0]["config"]["base_freq"], _attn_implementation="sdpa", output_attentions=False, ) @@ -98,6 +101,23 @@ def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn. m1.bias.data.copy_(m2.bias.data) +def convert_gpt2( + modalities_config_path: str, output_dir: str, num_testruns: int, device_modalities: str, device_hf: str +) -> None: + modalities_config = load_app_config_dict(modalities_config_path) + hf_model, modalities_model = convert_model_checkpoint(modalities_config) + + if num_testruns > 0: + test_converted_model( + hf_model.to(device_hf), + modalities_model.to(device_modalities), + num_testruns, + modalities_config["model_raw"]["config"]["vocab_size"], + ) + + hf_model.save_pretrained(output_dir) + + if __name__ == "__main__": import os @@ -114,15 +134,10 @@ def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn. 
args = parser.parse_args() - modalities_config = load_app_config_dict(args.modalities_config) - hf_model, modalities_model = convert_model_checkpoint(modalities_config) - - if args.num_testruns > 0: - test_converted_model( - hf_model.to(args.device_hf), - modalities_model.to(args.device_modalities), - args.num_testruns, - modalities_config["model_raw"]["config"]["vocab_size"], - ) - - hf_model.save_pretrained(args.output_dir) + convert_gpt2( + args.modalities_config, + args.output_dir, + args.num_testruns, + args.device_modalities, + args.device_hf, + ) From d516469fa32aa42b06f968364d2787c7d1606dbf Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 21 Feb 2025 12:14:44 +0100 Subject: [PATCH 03/28] fix(huggingface): Set correct model_type in GPT2Config. --- src/modalities/conversion/gpt2/configuration_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100755 => 100644 src/modalities/conversion/gpt2/configuration_gpt2.py diff --git a/src/modalities/conversion/gpt2/configuration_gpt2.py b/src/modalities/conversion/gpt2/configuration_gpt2.py old mode 100755 new mode 100644 index 9ead00b3e..7663cd227 --- a/src/modalities/conversion/gpt2/configuration_gpt2.py +++ b/src/modalities/conversion/gpt2/configuration_gpt2.py @@ -141,7 +141,7 @@ class GPT2Config(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "llama" + model_type = "modalities-gpt2" keys_to_ignore_at_inference = ["past_key_values"] # Default tensor parallel plan for base model `GPT2Model` base_model_tp_plan = { From 49191d139b9bbcbe321ee9a69433dc244009f996 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 21 Feb 2025 12:26:51 +0100 Subject: [PATCH 04/28] refactor(huggingface): Shortened some long lines. --- .../conversion/gpt2/modeling_gpt2.py | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/modalities/conversion/gpt2/modeling_gpt2.py b/src/modalities/conversion/gpt2/modeling_gpt2.py index 7d0ba09c0..f803d2f07 100644 --- a/src/modalities/conversion/gpt2/modeling_gpt2.py +++ b/src/modalities/conversion/gpt2/modeling_gpt2.py @@ -26,16 +26,12 @@ import torch import torch.utils.checkpoint -from modalities.conversion.gpt2.configuration_gpt2 import GPT2Config from torch import nn from transformers.activations import ACT2FN from transformers.cache_utils import Cache, DynamicCache, StaticCache from transformers.generation import GenerationMixin from transformers.modeling_attn_mask_utils import AttentionMaskConverter -from transformers.modeling_flash_attention_utils import ( - FlashAttentionKwargs, - _flash_attention_forward, -) +from transformers.modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward from transformers.modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -56,9 +52,11 @@ replace_return_docstrings, ) +from modalities.conversion.gpt2.configuration_gpt2 import GPT2Config + logger = logging.get_logger(__name__) -_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b-hf" +_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b-hf" # TODO: update to the actual checkpoint _CONFIG_FOR_DOC = "GPT2Config" @@ -350,8 +348,12 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. 
Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + # flash_attn<2.1 generates top-left aligned causal mask, while what is + # needed here is bottom-right alignement, that was made default for flash_attn>=2.1. + # This attribute is used to handle this difference. Reference: + # https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen + # (except for the case q_seqlen == 1) produces a wrong mask (top-left). self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() def forward( @@ -404,7 +406,8 @@ def forward( cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # TODO: These transpose are quite inefficient but Flash Attention requires the layout + # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) @@ -482,10 +485,13 @@ def forward( **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` + # once this is implemented. logger.warning_once( - "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does " + "not support `output_attentions=True`. Falling back to the manual attention implementation, " + "but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) return super().forward( hidden_states=hidden_states, @@ -533,15 +539,18 @@ def forward( if attention_mask is not None: causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # SDPA with memory-efficient backend is currently (torch==2.1.2) + # bugged with non-contiguous inputs with custom attn_mask, # Reference: https://github.com/pytorch/pytorch/issues/112577. 
if query_states.device.type == "cuda" and causal_mask is not None: query_states = query_states.contiguous() key_states = key_states.contiguous() value_states = value_states.contiguous() - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` + # if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. + # An inline conditional prevents dynamic shapes from compiling. is_causal = True if causal_mask is None and q_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -1065,7 +1074,8 @@ def _prepare_4d_causal_attention_mask_with_cache_position( return causal_mask -class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... +class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): + ... class GPT2ForCausalLM(GPT2PreTrainedModel, GenerationMixin): @@ -1126,8 +1136,9 @@ def forward( num_logits_to_keep (`int`, *optional*): Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + `input_ids` (special case). Only last token logits are needed for generation, and calculating + them only for that token can save memory, which becomes pretty significant for long sequences + or large vocabulary size. Returns: @@ -1382,8 +1393,8 @@ def forward( @add_start_docstrings( """ - The Llama-like GPT2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states - output) e.g. for Named-Entity-Recognition (NER) tasks. + The Llama-like GPT2 Model transformer with a token classification head on top (a linear layer + on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, LLAMA_START_DOCSTRING, ) From 7332916fb041c9a1fe98b22ad41796e11e001cfb Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 21 Feb 2025 12:29:12 +0100 Subject: [PATCH 05/28] feat(huggingface): When converting gpt2 now all the necessary model code gets included in the hf checkpoint. 
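
With the model code copied next to the weights and registered via `auto_map`, the exported checkpoint is meant to be loadable without the modalities package. A minimal loading sketch (the checkpoint path below is a placeholder, not a real directory from this repo):

```python
# Hedged sketch: load a converted checkpoint standalone; the path is a placeholder.
from transformers import AutoConfig, AutoModelForCausalLM

checkpoint_dir = "path/to/converted/model"
# auto_map resolves these to the copied configuration_gpt2.py / modeling_gpt2.py
config = AutoConfig.from_pretrained(checkpoint_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir, trust_remote_code=True)
print(config.model_type, type(model).__name__)
```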
--- .../conversion/gpt2/convert_gpt2.py | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index b41056c88..9c86d67b5 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -1,4 +1,5 @@ import argparse +import shutil import torch import torch.nn as nn @@ -12,6 +13,29 @@ from modalities.models.utils import ModelTypeEnum, get_model_from_config +def convert_gpt2( + modalities_config_path: str, output_dir: str, num_testruns: int, device_modalities: str, device_hf: str +) -> None: + modalities_config = load_app_config_dict(modalities_config_path) + hf_model, modalities_model = convert_model_checkpoint(modalities_config) + + if num_testruns > 0: + test_converted_model( + hf_model.to(device_hf), + modalities_model.to(device_modalities), + num_testruns, + modalities_config["model_raw"]["config"]["vocab_size"], + ) + + hf_model.config.auto_map = { + "AutoConfig": "configuration_gpt2.GPT2Config", + "AutoModel": "modeling_gpt2.GPT2Model", + "AutoModelForCausalLM": "modeling_gpt2.GPT2ForCausalLM", + } + hf_model.save_pretrained(output_dir) + _transfer_model_code(output_dir) + + def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, GPT2LLM]: gpt2_config = convert_model_config(modalities_config) hf_model = GPT2ForCausalLM(gpt2_config).to(dtype=torch.bfloat16) @@ -101,21 +125,19 @@ def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn. m1.bias.data.copy_(m2.bias.data) -def convert_gpt2( - modalities_config_path: str, output_dir: str, num_testruns: int, device_modalities: str, device_hf: str -) -> None: - modalities_config = load_app_config_dict(modalities_config_path) - hf_model, modalities_model = convert_model_checkpoint(modalities_config) +def _transfer_model_code(output_dir: str): + source_dir = os.path.dirname(__file__) + modeling_gpt2_path = os.path.join(source_dir, "modeling_gpt2.py") + configuration_gpt2_path = os.path.join(source_dir, "configuration_gpt2.py") + shutil.copy(modeling_gpt2_path, output_dir) + shutil.copy(configuration_gpt2_path, output_dir) - if num_testruns > 0: - test_converted_model( - hf_model.to(device_hf), - modalities_model.to(device_modalities), - num_testruns, - modalities_config["model_raw"]["config"]["vocab_size"], - ) - - hf_model.save_pretrained(output_dir) + target_modeling_file = os.path.join(output_dir, "modeling_gpt2.py") + with open(target_modeling_file, "r") as file: + content = file.read() + content = content.replace("modalities.conversion.gpt2.configuration_gpt2", ".configuration_gpt2") + with open(target_modeling_file, "w") as file: + file.write(content) if __name__ == "__main__": From a390289a716db2478a7611cd2d6aad87d1d703f9 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Fri, 21 Feb 2025 12:52:17 +0100 Subject: [PATCH 06/28] docs(huggingface): Added docstrings for conversion script. 
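
As described by the new docstrings, the conversion can also be driven programmatically instead of via the CLI. A hedged sketch, assuming a single-process setup comparable to the script's `__main__` block (both paths are placeholders):

```python
# Sketch only: programmatic use of the documented entry point; paths are placeholders.
import os

from modalities.conversion.gpt2.convert_gpt2 import convert_gpt2

# single-process defaults, mirroring the script's __main__ block
os.environ["WORLD_SIZE"] = "1"
os.environ["RANK"] = "0"

convert_gpt2(
    modalities_config_path="configs/example_conversion_config.yaml",  # must contain model_raw and checkpointed_model
    output_dir="converted_checkpoint",
    num_testruns=5,  # compare Hugging Face vs. modalities logits on 5 random inputs
    device_modalities="cpu",
    device_hf="cpu",
)
```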
--- .../conversion/gpt2/convert_gpt2.py | 83 ++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index 9c86d67b5..c22888434 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -1,3 +1,23 @@ +""" +usage: convert_gpt2.py [-h] [--num_testruns NUM_TESTRUNS] [--device_modalities DEVICE_MODALITIES] + [--device_hf DEVICE_HF] modalities_config output_dir + +Convert GPT-2 model checkpoint to Huggingface transformers format. + +positional arguments: + modalities_config Path to the modalities config file. + output_dir Directory to save the converted model. + +options: + -h, --help show this help message and exit + --num_testruns NUM_TESTRUNS + Number of test runs to perform. + --device_modalities DEVICE_MODALITIES + Device for the modalities model. + --device_hf DEVICE_HF + Device for the Hugging Face model. +""" + import argparse import shutil @@ -14,8 +34,24 @@ def convert_gpt2( - modalities_config_path: str, output_dir: str, num_testruns: int, device_modalities: str, device_hf: str + modalities_config_path: str, + output_dir: str, + num_testruns: int = 0, + device_modalities: str = "cpu", + device_hf: str = "cpu", ) -> None: + """Takes a modalities gpt2 model and converts it to a Huggingface transformers model. + The provided config yaml file should contain the model_raw section with the model configuration. + Additionally, the checkpointed_model section should be present and contain the path to the model checkpoint. + Optionally, the function can run a number of test runs to compare the converted model with the original one. + + Args: + modalities_config_path (str): Path to the modalities config file. + output_dir (str): Directory to save the converted model. + num_testruns (int, optional): Number of test runs to perform. Defaults to 0. + device_modalities (str, optional): Device for the modalities model. Defaults to "cpu". + device_hf (str, optional): Device for the Hugging Face model. Defaults to "cpu". + """ modalities_config = load_app_config_dict(modalities_config_path) hf_model, modalities_model = convert_model_checkpoint(modalities_config) @@ -37,6 +73,16 @@ def convert_gpt2( def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, GPT2LLM]: + """Converts the modalities model to a Huggingface transformers model. + Both the loaded modalities model and the converted Huggingface model are returned + so that they can be compared. + + Args: + modalities_config (dict): Modalities config dictionary. + + Returns: + tuple[GPT2ForCausalLM, GPT2LLM]: Converted Hugging Face model and the original modalities model. + """ gpt2_config = convert_model_config(modalities_config) hf_model = GPT2ForCausalLM(gpt2_config).to(dtype=torch.bfloat16) modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) @@ -45,6 +91,16 @@ def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, def convert_model_config(modalities_config: dict) -> GPT2Config: + """Converts the modalities model configuration to a Huggingface transformers configuration. + For this the model_raw section of the modalities config is used. + Corresponding entries are mapped to the Huggingface configuration. + + Args: + modalities_config (dict): Modalities config dictionary. + + Returns: + GPT2Config: Converted Huggingface model configuration. 
+ """ config = modalities_config["model_raw"]["config"] assert config["poe_type"] == PositionTypes.NOPE @@ -77,6 +133,14 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: def test_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, num_testruns: int, vocab_size: int): + """Tests the converted model by inputting a random token sequence and comparing the output logits of both models. + + Args: + hf_model (GPT2ForCausalLM): Huggingface transformers model. + modalities_model (GPT2LLM): Modalities model. + num_testruns (int): Number of test runs to perform. + vocab_size (int): Vocabulary size of the model. (Required for generating random input tokens.) + """ for _ in tqdm(range(num_testruns), desc="Testing converted model"): input_ids = torch.randint(0, vocab_size, (1, 1024), device=hf_model.device) inputs = {modalities_model.sample_key: input_ids.to(modalities_model.transformer.wte.weight.device)} @@ -90,6 +154,13 @@ def test_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, n def _copy_weights_model(hf_model_model: GPT2ForCausalLM, modalities_model: GPT2LLM): + """Copies the weights of the modalities model to the Huggingface transformers model. + + Args: + hf_model_model (GPT2ForCausalLM): The uninitialized Huggingface transformers model. + The weights will be copied here. + modalities_model (GPT2LLM): The modalities model from which the weights will be copied. + """ hf_model_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) for hf_layer, modalities_layer in zip(hf_model_model.model.layers, modalities_model.transformer.h): _copy_weights_attention(hf_layer, modalities_layer) @@ -126,6 +197,14 @@ def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn. def _transfer_model_code(output_dir: str): + """Copies the required model code to the output directory. + This allows the converted model to be used without the modalities package via: + >>> from transformers import AutoModelForCausalLM + >>> model = AutoModelForCausalLM.from_pretrained("path/to/converted/model", trust_remote_code=True) + + Args: + output_dir (str): Directory of the converted model. 
+ """ source_dir = os.path.dirname(__file__) modeling_gpt2_path = os.path.join(source_dir, "modeling_gpt2.py") configuration_gpt2_path = os.path.join(source_dir, "configuration_gpt2.py") @@ -147,7 +226,7 @@ def _transfer_model_code(output_dir: str): os.environ["WORLD_SIZE"] = "1" os.environ["RANK"] = "0" - parser = argparse.ArgumentParser(description="Convert GPT-2 model checkpoint.") + parser = argparse.ArgumentParser(description="Convert GPT-2 model checkpoint to Huggingface transformers format.") parser.add_argument("modalities_config", type=str, help="Path to the modalities config file.") parser.add_argument("output_dir", type=str, help="Directory to save the converted model.") parser.add_argument("--num_testruns", type=int, default=0, help="Number of test runs to perform.") From 0e4310dbdb89c0f0892a8697c330580845e16523 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Fri, 21 Feb 2025 13:49:03 +0000 Subject: [PATCH 07/28] chore(getting_started): update config (no bias, layer norm, swiglu, pytorch_flash attn) --- tutorials/getting_started/example_config.yaml | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/tutorials/getting_started/example_config.yaml b/tutorials/getting_started/example_config.yaml index 4faeec9d6..b5b3749be 100644 --- a/tutorials/getting_started/example_config.yaml +++ b/tutorials/getting_started/example_config.yaml @@ -211,7 +211,7 @@ model_raw: ffn_hidden: 128 n_embd: 128 dropout: 0.0 - bias: true # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + bias: false attention_config: qkv_transforms: - type_hint: RotaryTransform @@ -220,29 +220,26 @@ model_raw: n_head: ${model_raw.config.n_head_q} #it has to be head_q here seq_length_dim: -2 base_freq: 10000 - attention_implementation: manual - activation_type: gelu + attention_implementation: pytorch_flash + activation_type: swiglu attention_norm: component_key: layer_norm - variant_key: rms_norm + variant_key: layer_norm config: - ndim: ${model_raw.config.n_embd} - bias: true - epsilon: 1e-5 + normalized_shape: ${model_raw.config.n_embd} + eps: 1.0e-05 ffn_norm: component_key: layer_norm - variant_key: rms_norm + variant_key: layer_norm config: - ndim: ${model_raw.config.n_embd} - bias: true - epsilon: 1e-5 + normalized_shape: ${model_raw.config.n_embd} + eps: 1.0e-05 lm_head_norm: component_key: layer_norm - variant_key: rms_norm + variant_key: layer_norm config: - ndim: ${model_raw.config.n_embd} - bias: true - epsilon: 1e-5 + normalized_shape: ${model_raw.config.n_embd} + eps: 1.0e-05 scheduler: component_key: scheduler @@ -298,4 +295,4 @@ evaluation_subscriber: mode: OFFLINE experiment_id: ${settings.experiment_id} directory: wandb_storage - config_file_path: ${settings.config_file_path} \ No newline at end of file + config_file_path: ${settings.config_file_path} From d3ab1b5c16570b75e40dd0df9102ed46f76b401d Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Fri, 21 Feb 2025 14:45:55 +0000 Subject: [PATCH 08/28] test(conversion): apply checkpoint conversion based on getting started example --- tests/tests.py | 97 +++++++++++++++++++ .../example_conversion_config_template.yaml | 22 +++++ .../run_checkpoint_conversion.sh | 25 +++++ 3 files changed, 144 insertions(+) create mode 100644 tutorials/getting_started/example_conversion_config_template.yaml create mode 100644 tutorials/getting_started/run_checkpoint_conversion.sh diff --git a/tests/tests.py b/tests/tests.py index 2c10e8679..3034f1b78 100644 --- a/tests/tests.py +++ 
b/tests/tests.py @@ -48,6 +48,86 @@ def check_existence_and_clear_getting_started_example_output( except OSError as e: print(f"Error: {e.filename} - {e.strerror}.") + # checkpoint converted + checkpoints_converted = [ + join(output_directory_checkpoints, elem) + for elem in os.listdir(output_directory_checkpoints) + if elem.startswith("eid_") + ] + for checkpoint_converted in checkpoints_converted: + assert isdir(checkpoint_converted), f"ERROR! {checkpoint_converted} does not exist" + try: + shutil.rmtree(checkpoint_converted) + print(f"> removed {checkpoint_converted}") + except OSError as e: + print(f"Error: {e.filename} - {e.strerror}.") + + # config converted + config_converted = join(run_getting_started_example_directory, "example_conversion_config.yaml") + assert isfile(config_converted), f"ERROR! {config_converted} does not exist" + try: + os.remove(config_converted) + print(f"> removed {config_converted}") + except OSError as e: + print(f"Error: {e.filename} - {e.strerror}.") + + +def get_checkpoint_from_getting_started_example(run_getting_started_example_directory: str) -> str: + output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints") + + checkpoint_directories = [ + join(output_directory_checkpoints, elem) + for elem in os.listdir(output_directory_checkpoints) + if isdir(join(output_directory_checkpoints, elem)) + ] + assert ( + len(checkpoint_directories) == 1 + ), f"ERROR! found {len(checkpoint_directories)} checkpoint directories for getting started example, expected 1." + checkpoint_directory = checkpoint_directories[0] + + checkpoints = [ + join(checkpoint_directory, elem) + for elem in os.listdir(checkpoint_directory) + if isfile(join(checkpoint_directory, elem)) + ] + checkpoints = [elem for elem in checkpoints if "model" in elem and elem.endswith(".bin")] + assert ( + len(checkpoints) == 1 + ), f"ERROR! found {len(checkpoints)} checkpoints for getting started example, expected 1." + checkpoint = checkpoints[0] + + return checkpoint + + +def replace_checkpoint_in_conversion_config( + run_getting_started_example_directory: str, modalities_checkpoint: str +) -> str: + # read example config + example_config = join(run_getting_started_example_directory, "example_config.yaml") + assert isfile(example_config), f"ERROR! could not find file at {example_config}" + with open(example_config, "r") as f: + lines = f.readlines() + + # read conversion config template + conversion_config_template = join(run_getting_started_example_directory, "example_conversion_config_template.yaml") + assert isfile(conversion_config_template), f"ERROR! could not find file at {conversion_config_template}" + with open(conversion_config_template, "r") as f: + lines_additional = f.readlines() + lines += lines_additional + + last_line_start = " checkpoint_path:" + assert lines[-1].startswith( + last_line_start + ), f"ERROR! expected file at {conversion_config_template} to contain 'checkpoint_path' in last line." 
+ lines[-1] = f"{last_line_start} {modalities_checkpoint}" + + # write conversion config + conversion_config = join(run_getting_started_example_directory, "example_conversion_config.yaml") + with open(conversion_config, "w") as f: + for line in lines: + f.write(line) + return conversion_config + def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"): """ @@ -120,6 +200,23 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d date_of_run = datetime.now().strftime("%Y-%m-%d__%H-%M-%S") subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True) + # checkpoint conversion (based on getting started example) + print("\n=== RUN CHECKPOINT CONVERSION (BASED ON GETTING STARTED EXAMPLE) ===") + modalities_checkpoint = get_checkpoint_from_getting_started_example(run_getting_started_example_directory) + conversion_config_path = replace_checkpoint_in_conversion_config( + run_getting_started_example_directory, modalities_checkpoint + ) + + run_conversion_script = _ROOT_DIR / "tutorials" / "getting_started" / "run_checkpoint_conversion.sh" + assert isfile(run_conversion_script), f"ERROR! {run_conversion_script} does not exist." + command_conversion = f"cd {run_getting_started_example_directory}; " + command_conversion += f"sh run_checkpoint_conversion.sh {conversion_config_path} " + command_conversion += ( + f"{run_getting_started_example_directory}/checkpoints/{modalities_checkpoint.split('/')[-1]}" + ) + print(command_conversion) + subprocess.run(command_conversion, shell=True, capture_output=False, text=True) + check_existence_and_clear_getting_started_example_output(run_getting_started_example_directory, date_of_run) # warmstart example diff --git a/tutorials/getting_started/example_conversion_config_template.yaml b/tutorials/getting_started/example_conversion_config_template.yaml new file mode 100644 index 000000000..92f602996 --- /dev/null +++ b/tutorials/getting_started/example_conversion_config_template.yaml @@ -0,0 +1,22 @@ +tokenizer: + component_key: tokenizer + variant_key: pretrained_hf_tokenizer + config: + pretrained_model_name_or_path: tokenizer + padding: false + truncation: false + +checkpointed_model: + component_key: model + variant_key: checkpointed + config: + checkpoint_loading: + component_key: checkpoint_loading + variant_key: torch + config: + device: cpu + precision: BF16 + model: + instance_key: model + pass_type: BY_REFERENCE + checkpoint_path: \ No newline at end of file diff --git a/tutorials/getting_started/run_checkpoint_conversion.sh b/tutorials/getting_started/run_checkpoint_conversion.sh new file mode 100644 index 000000000..43492f8bf --- /dev/null +++ b/tutorials/getting_started/run_checkpoint_conversion.sh @@ -0,0 +1,25 @@ +#!/bin/sh +set -e + +# --------------------------------------------- +# bash run_checkpoint_conversion +# --------------------------------------------- + +####################### +### INPUT ARGUMENTS ### +####################### +if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist + then + echo "Need to specify arguments, e.g. 
bash run_checkpoint_conversion modalities_config output_dir" + exit +fi + +############# +### RUN ##### +############# +echo "> run checkpoint conversion" +echo "python ../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 +python ../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 + +echo "> test checkpoint conversion" +# TODO From c01a1fee2a2c4be5a1537f83be922418f9c938b9 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Mon, 24 Feb 2025 08:20:24 +0000 Subject: [PATCH 09/28] test(getting_started): move wandb directory to data (so that it gets removed) --- tests/tests.py | 9 +++++++++ tutorials/getting_started/example_config.yaml | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/tests.py b/tests/tests.py index 3034f1b78..7d97dfd60 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -30,6 +30,15 @@ def check_existence_and_clear_getting_started_example_output( except OSError as e: print(f"Error: {e.filename} - {e.strerror}.") + # wandb directory + output_directory_wandb = join(run_getting_started_example_directory, "data", "wandb_storage") + assert isdir(output_directory_wandb), f"ERROR! {output_directory_wandb} does not exist" + try: + shutil.rmtree(output_directory_wandb) + print(f"> removed {output_directory_wandb}") + except OSError as e: + print(f"Error: {e.filename} - {e.strerror}.") + # checkpoint output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints") checkpoints = [elem for elem in os.listdir(output_directory_checkpoints) if elem.startswith("20")] diff --git a/tutorials/getting_started/example_config.yaml b/tutorials/getting_started/example_config.yaml index b5b3749be..f1b282101 100644 --- a/tutorials/getting_started/example_config.yaml +++ b/tutorials/getting_started/example_config.yaml @@ -294,5 +294,5 @@ evaluation_subscriber: project: modalities_getting_started mode: OFFLINE experiment_id: ${settings.experiment_id} - directory: wandb_storage + directory: data/wandb_storage config_file_path: ${settings.config_file_path} From 85411be2a562da17cdb1dfbad77efcdbaa351749 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Mon, 24 Feb 2025 09:18:52 +0000 Subject: [PATCH 10/28] refactor(getting_started): refactoring (folder structure) and general update (README and configs) --- tests/tests.py | 19 ++++---- tutorials/getting_started/README.md | 47 +++++++++++-------- .../{ => configs}/example_config.yaml | 0 .../example_conversion_config_template.yaml | 0 .../example_dataset_config_test.yaml | 0 .../example_dataset_config_train.yaml | 0 .../example_text_generation_config.yaml | 29 +++++------- .../run_checkpoint_conversion.sh | 4 +- .../run_getting_started_example.sh | 6 +-- 9 files changed, 55 insertions(+), 50 deletions(-) rename tutorials/getting_started/{ => configs}/example_config.yaml (100%) rename tutorials/getting_started/{ => configs}/example_conversion_config_template.yaml (100%) rename tutorials/getting_started/{ => configs}/example_dataset_config_test.yaml (100%) rename tutorials/getting_started/{ => configs}/example_dataset_config_train.yaml (100%) rename tutorials/getting_started/{ => configs}/example_text_generation_config.yaml (78%) rename tutorials/getting_started/{ => scripts}/run_checkpoint_conversion.sh (79%) rename tutorials/getting_started/{ => scripts}/run_getting_started_example.sh (85%) diff --git a/tests/tests.py b/tests/tests.py index 7d97dfd60..b4fa2b5e5 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -72,7 +72,7 @@ def check_existence_and_clear_getting_started_example_output( 
print(f"Error: {e.filename} - {e.strerror}.") # config converted - config_converted = join(run_getting_started_example_directory, "example_conversion_config.yaml") + config_converted = join(run_getting_started_example_directory, "configs", "example_conversion_config.yaml") assert isfile(config_converted), f"ERROR! {config_converted} does not exist" try: os.remove(config_converted) @@ -112,13 +112,15 @@ def replace_checkpoint_in_conversion_config( run_getting_started_example_directory: str, modalities_checkpoint: str ) -> str: # read example config - example_config = join(run_getting_started_example_directory, "example_config.yaml") + example_config = join(run_getting_started_example_directory, "configs", "example_config.yaml") assert isfile(example_config), f"ERROR! could not find file at {example_config}" with open(example_config, "r") as f: lines = f.readlines() # read conversion config template - conversion_config_template = join(run_getting_started_example_directory, "example_conversion_config_template.yaml") + conversion_config_template = join( + run_getting_started_example_directory, "configs", "example_conversion_config_template.yaml" + ) assert isfile(conversion_config_template), f"ERROR! could not find file at {conversion_config_template}" with open(conversion_config_template, "r") as f: lines_additional = f.readlines() @@ -131,7 +133,7 @@ def replace_checkpoint_in_conversion_config( lines[-1] = f"{last_line_start} {modalities_checkpoint}" # write conversion config - conversion_config = join(run_getting_started_example_directory, "example_conversion_config.yaml") + conversion_config = join(run_getting_started_example_directory, "configs", "example_conversion_config.yaml") with open(conversion_config, "w") as f: for line in lines: f.write(line) @@ -197,14 +199,13 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d print("\n=== RUN GETTING STARTED EXAMPLE ===") run_getting_started_example_directory = _ROOT_DIR / "tutorials" / "getting_started" run_getting_started_example_script = ( - _ROOT_DIR / "tutorials" / "getting_started" / "run_getting_started_example.sh" + _ROOT_DIR / "tutorials" / "getting_started" / "scripts" / "run_getting_started_example.sh" ) assert isfile( run_getting_started_example_script ), f"ERROR! {run_getting_started_example_script} does not exist." - command_getting_started_example = ( - f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]} {devices[1]}" - ) + command_getting_started_example = f"cd {run_getting_started_example_directory}; " + command_getting_started_example += f"bash scripts/run_getting_started_example.sh {devices[0]} {devices[1]}" print(command_getting_started_example) date_of_run = datetime.now().strftime("%Y-%m-%d__%H-%M-%S") subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True) @@ -216,7 +217,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d run_getting_started_example_directory, modalities_checkpoint ) - run_conversion_script = _ROOT_DIR / "tutorials" / "getting_started" / "run_checkpoint_conversion.sh" + run_conversion_script = _ROOT_DIR / "tutorials" / "getting_started" / "scripts" / "run_checkpoint_conversion.sh" assert isfile(run_conversion_script), f"ERROR! {run_conversion_script} does not exist." 
command_conversion = f"cd {run_getting_started_example_directory}; " command_conversion += f"sh run_checkpoint_conversion.sh {conversion_config_path} " diff --git a/tutorials/getting_started/README.md b/tutorials/getting_started/README.md index 45bb96870..11700863f 100644 --- a/tutorials/getting_started/README.md +++ b/tutorials/getting_started/README.md @@ -7,20 +7,27 @@ As a reference, this example has the following folder structure. Folders in <> w ``` └── getting_started ├── checkpoints - │ └─ - ├── example_config.yaml + │ └── + ├── configs + │ ├── example_config.yaml + │ ├── example_conversion_config_template.yaml + │ ├── example_dataset_config_test.yaml + │ ├── example_dataset_config_train.yaml + │ └── example_text_generation_config.yaml ├── data │ ├── mem_map - │ ├── - │ └── raw - │ ├── redpajama_v2_samples_512_test.jsonl - │ └── redpajama_v2_samples_512_train.jsonl - ├── getting_started_example.md + │ │ └── + │ ├── raw + │ │ ├── redpajama_v2_samples_512_test.jsonl + │ │ └── redpajama_v2_samples_512_train.jsonl + │ └── + ├── scripts + │ ├── run_checkpoint_conversion.sh + │ └── run_getting_started_example.sh ├── tokenizer - │ ├── tokenizer.json - │ └── tokenizer_config.json - └── wandb - └── + │ ├── tokenizer_config.json + │ └── tokenizer.json + └── README.md ``` ## 1. Preprocessing @@ -40,7 +47,7 @@ The two raw dataset splits for training and evaluation can be found in and need to be preprocessed into the [MemMap dataset format](https://github.com/Modalities/modalities/blob/main/src/modalities/dataloader/dataset.py). ### Config File -To do so, we employ the `example_dataset_config_train.yaml` and `example_dataset_config_test.yaml` configuration files, which contain the paths of the input and output files, the path of the tokenizer as well as some configurable parameters: +To do so, we employ the `configs/example_dataset_config_train.yaml` and `configs/example_dataset_config_test.yaml` configuration files, which contain the paths of the input and output files, the path of the tokenizer as well as some configurable parameters: ```yaml # example_dataset_config_train.yaml @@ -88,10 +95,10 @@ After having determined the index, we create the packed dataset as described bel ```sh # train split -modalities data pack_encoded_data example_dataset_config_train.yaml +modalities data pack_encoded_data configs/example_dataset_config_train.yaml # test split -modalities data pack_encoded_data example_dataset_config_test.yaml +modalities data pack_encoded_data configs/example_dataset_config_test.yaml ``` This will create the following file structure which can we can directly load into the [PackedMemMapdataset](https://github.com/Modalities/modalities/blob/main/src/modalities/dataloader/dataset.py#L65). ``` @@ -148,7 +155,7 @@ first and then divides it into chunks of size context-length. ### Config File In Modalities, we describe the entire training and evaluation setup (i.e., components such as model, trainer, evaluator, dataloder etc.) within a single configuration file. Not only does this increase reproducibility but also allows for having the entire training runs under version control. A full list of all the components already available in modalities an be found [here](../../docs/components/components.md). -The example config file for this experiment can be found in `tutorials/getting_started/example_config.yaml`. +The example config file for this experiment can be found in `tutorials/getting_started/configs/example_config.yaml`. 
### Training Having created the dataset and defined the experiment in the configuration file, we can already start the training by running the following command. @@ -157,7 +164,7 @@ Having created the dataset and defined the experiment in the configuration file, CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --rdzv-endpoint localhost:29505 \ --nnodes 1 \ --nproc_per_node 8 \ - $(which modalities) run --config_file_path example_config.yaml + $(which modalities) run --config_file_path configs/example_config.yaml ``` The command can be broken down into the following parts: @@ -183,15 +190,15 @@ The command can be broken down into the following parts: 7. **`run`**: - Command argument for the `modalities` executable to initiate the training. -8. **`--config_file_path example_config.yaml`**: - - Specifies the path to the configuration file. The file `example_config.yaml` contains the configuration of the components, including dataset and model configurations, training parameters, etc. +8. **`--config_file_path configs/example_config.yaml`**: + - Specifies the path to the configuration file. The file `configs/example_config.yaml` contains the configuration of the components, including dataset and model configurations, training parameters, etc. Already during the training, the checkpoints can be found locally in `checkpoints/` and the loss and metric developments can be inspected online in [Weights&Biases](https://wandb.ai/). ### Evaluation -In order to let the model generate text, we need to specify the last training checkpoint under `model_path` in the config file `example_text_generation_config.yaml`: +In order to let the model generate text, we need to specify the last training checkpoint under `model_path` in the config file `configs/example_text_generation_config.yaml`: ``` # example_text_generation_config.yaml @@ -210,7 +217,7 @@ settings: Subsequently, given the checkpoint and tokenizer, we can load the model for text generation as follows: ```sh -modalities generate_text --config_file_path example_text_generation_config.yaml +modalities generate_text --config_file_path configs/example_text_generation_config.yaml ``` This opens an interactive chatting CMD interface. 
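Once a checkpoint has additionally been converted with `scripts/run_checkpoint_conversion.sh`, the resulting directory can be loaded as a plain Hugging Face model without the modalities package on the import path. The following is a minimal sketch; the output directory name `converted_checkpoint` is only an assumed example:

```python
# Minimal sketch: load the converted checkpoint as a regular Hugging Face model.
# "converted_checkpoint" is an assumed example for the output directory passed to
# the conversion script; trust_remote_code is required because the model code is
# shipped alongside the checkpoint.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "converted_checkpoint",
    local_files_only=True,
    trust_remote_code=True,
).to(dtype=torch.bfloat16)

input_ids = torch.randint(0, model.config.vocab_size, (1, 16))
with torch.no_grad():
    logits = model(input_ids=input_ids).logits
print(logits.shape)  # torch.Size([1, 16, vocab_size])
```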
diff --git a/tutorials/getting_started/example_config.yaml b/tutorials/getting_started/configs/example_config.yaml similarity index 100% rename from tutorials/getting_started/example_config.yaml rename to tutorials/getting_started/configs/example_config.yaml diff --git a/tutorials/getting_started/example_conversion_config_template.yaml b/tutorials/getting_started/configs/example_conversion_config_template.yaml similarity index 100% rename from tutorials/getting_started/example_conversion_config_template.yaml rename to tutorials/getting_started/configs/example_conversion_config_template.yaml diff --git a/tutorials/getting_started/example_dataset_config_test.yaml b/tutorials/getting_started/configs/example_dataset_config_test.yaml similarity index 100% rename from tutorials/getting_started/example_dataset_config_test.yaml rename to tutorials/getting_started/configs/example_dataset_config_test.yaml diff --git a/tutorials/getting_started/example_dataset_config_train.yaml b/tutorials/getting_started/configs/example_dataset_config_train.yaml similarity index 100% rename from tutorials/getting_started/example_dataset_config_train.yaml rename to tutorials/getting_started/configs/example_dataset_config_train.yaml diff --git a/tutorials/getting_started/example_text_generation_config.yaml b/tutorials/getting_started/configs/example_text_generation_config.yaml similarity index 78% rename from tutorials/getting_started/example_text_generation_config.yaml rename to tutorials/getting_started/configs/example_text_generation_config.yaml index 714d05148..3caad5c0a 100644 --- a/tutorials/getting_started/example_text_generation_config.yaml +++ b/tutorials/getting_started/configs/example_text_generation_config.yaml @@ -2,7 +2,7 @@ settings: referencing_keys: sample_key: input_ids prediction_key: logits - model_path: ./checkpoints/2024-06-27__14-17-52/eid_2024-06-27__14-17-52-model-num_steps_48-num_tokens_393216.bin + model_path: ./checkpoints/2025-02-24__08-53-31_5b6cf982/eid_2025-02-24__08-53-31_5b6cf982-model-seen_steps_48-seen_tokens_393216-target_steps_95-target_tokens_778240.bin device: 0 sequence_length: 1024 @@ -53,7 +53,7 @@ model: ffn_hidden: 128 n_embd: 128 dropout: 0.0 - bias: true # True: bias in Linears, like GPT-2. 
False: a bit better and faster + bias: false attention_config: qkv_transforms: - type_hint: RotaryTransform @@ -62,29 +62,26 @@ model: n_head: ${model.config.n_head_q} #it has to be head_q here seq_length_dim: -2 base_freq: 10000 - attention_implementation: manual - activation_type: gelu + attention_implementation: pytorch_flash + activation_type: swiglu attention_norm: component_key: layer_norm - variant_key: rms_norm + variant_key: layer_norm config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 + normalized_shape: ${model.config.n_embd} + eps: 1.0e-05 ffn_norm: component_key: layer_norm - variant_key: rms_norm + variant_key: layer_norm config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 + normalized_shape: ${model.config.n_embd} + eps: 1.0e-05 lm_head_norm: component_key: layer_norm - variant_key: rms_norm + variant_key: layer_norm config: - ndim: ${model.config.n_embd} - bias: true - epsilon: 1e-5 + normalized_shape: ${model.config.n_embd} + eps: 1.0e-05 tokenizer: component_key: tokenizer diff --git a/tutorials/getting_started/run_checkpoint_conversion.sh b/tutorials/getting_started/scripts/run_checkpoint_conversion.sh similarity index 79% rename from tutorials/getting_started/run_checkpoint_conversion.sh rename to tutorials/getting_started/scripts/run_checkpoint_conversion.sh index 43492f8bf..452a15d3a 100644 --- a/tutorials/getting_started/run_checkpoint_conversion.sh +++ b/tutorials/getting_started/scripts/run_checkpoint_conversion.sh @@ -18,8 +18,8 @@ fi ### RUN ##### ############# echo "> run checkpoint conversion" -echo "python ../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 -python ../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 +echo "python ../../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 +python ../../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 echo "> test checkpoint conversion" # TODO diff --git a/tutorials/getting_started/run_getting_started_example.sh b/tutorials/getting_started/scripts/run_getting_started_example.sh similarity index 85% rename from tutorials/getting_started/run_getting_started_example.sh rename to tutorials/getting_started/scripts/run_getting_started_example.sh index 30555256a..5e56d5142 100644 --- a/tutorials/getting_started/run_getting_started_example.sh +++ b/tutorials/getting_started/scripts/run_getting_started_example.sh @@ -29,6 +29,6 @@ echo "> run getting_started_examples on CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVI modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_train.idx data/raw/redpajama_v2_samples_512_train.jsonl modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_test.idx data/raw/redpajama_v2_samples_512_test.jsonl -modalities data pack_encoded_data example_dataset_config_train.yaml -modalities data pack_encoded_data example_dataset_config_test.yaml -CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path example_config.yaml +modalities data pack_encoded_data configs/example_dataset_config_train.yaml +modalities data pack_encoded_data configs/example_dataset_config_test.yaml +CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path configs/example_config.yaml From 262f06c95df0b66f92173e91267b2f36c042798b Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Mon, 24 Feb 2025 11:32:15 +0100 Subject: 
[PATCH 11/28] fix(huggingface): Added missing import. --- src/modalities/conversion/gpt2/convert_gpt2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index c22888434..b7f46ac8c 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -19,6 +19,7 @@ """ import argparse +import os import shutil import torch @@ -220,8 +221,6 @@ def _transfer_model_code(output_dir: str): if __name__ == "__main__": - import os - os.environ["LOCAL_RANK"] = "0" os.environ["WORLD_SIZE"] = "1" os.environ["RANK"] = "0" From 6564452db5505d3c061ef9236ea9afa76f262168 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Mon, 24 Feb 2025 11:35:59 +0100 Subject: [PATCH 12/28] refactor(huggingface): Minor code improvement. --- src/modalities/conversion/gpt2/convert_gpt2.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index b7f46ac8c..6f244a632 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -198,7 +198,7 @@ def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn. def _transfer_model_code(output_dir: str): - """Copies the required model code to the output directory. + """Copies the required model code to the output directory and replaces modalities imports. This allows the converted model to be used without the modalities package via: >>> from transformers import AutoModelForCausalLM >>> model = AutoModelForCausalLM.from_pretrained("path/to/converted/model", trust_remote_code=True) @@ -206,12 +206,19 @@ def _transfer_model_code(output_dir: str): Args: output_dir (str): Directory of the converted model. 
""" + _copy_model_files(output_dir) + _change_modalities_import_to_relative_import(output_dir) + + +def _copy_model_files(output_dir: str): source_dir = os.path.dirname(__file__) modeling_gpt2_path = os.path.join(source_dir, "modeling_gpt2.py") configuration_gpt2_path = os.path.join(source_dir, "configuration_gpt2.py") shutil.copy(modeling_gpt2_path, output_dir) shutil.copy(configuration_gpt2_path, output_dir) + +def _change_modalities_import_to_relative_import(output_dir: str): target_modeling_file = os.path.join(output_dir, "modeling_gpt2.py") with open(target_modeling_file, "r") as file: content = file.read() From f7e558c53086da1e90b497a3d0f7931df6e51e5e Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Mon, 24 Feb 2025 10:45:27 +0000 Subject: [PATCH 13/28] fix(getting_started): capture subprocess errors properly + bug fix --- tests/tests.py | 25 +++++++++++-------- .../scripts/run_checkpoint_conversion.sh | 4 +-- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index b4fa2b5e5..4b387f311 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -140,6 +140,14 @@ def replace_checkpoint_in_conversion_config( return conversion_config +def subprocess_run(command: str) -> None: + print(command) + try: + subprocess.run(command, shell=True, capture_output=False, check=True, text=True) + except subprocess.CalledProcessError: + raise Exception("SUBPROCESS RUN FAILED.") + + def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"): """ Run tests on cpu, single gpu and multiple gpus @@ -179,8 +187,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d command_unit_tests = ( f"cd {_ROOT_DIR} && CUDA_VISIBLE_DEVICES={devices[0] if single_gpu else None} python -m pytest" ) - print(command_unit_tests) - subprocess.run(command_unit_tests, shell=True, capture_output=False, text=True) + subprocess_run(command_unit_tests) # run multi-gpu tests if multi_gpu: @@ -192,8 +199,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d command_end_to_end_tests = ( f"cd {run_distributed_tests_directory}; bash run_distributed_tests.sh {devices[0]} {devices[1]} --no-cov" ) - print(command_end_to_end_tests) - subprocess.run(command_end_to_end_tests, shell=True, capture_output=False, text=True) + subprocess_run(command_end_to_end_tests) # getting started example print("\n=== RUN GETTING STARTED EXAMPLE ===") @@ -206,9 +212,8 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d ), f"ERROR! {run_getting_started_example_script} does not exist." command_getting_started_example = f"cd {run_getting_started_example_directory}; " command_getting_started_example += f"bash scripts/run_getting_started_example.sh {devices[0]} {devices[1]}" - print(command_getting_started_example) date_of_run = datetime.now().strftime("%Y-%m-%d__%H-%M-%S") - subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True) + subprocess_run(command_getting_started_example) # checkpoint conversion (based on getting started example) print("\n=== RUN CHECKPOINT CONVERSION (BASED ON GETTING STARTED EXAMPLE) ===") @@ -220,12 +225,11 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d run_conversion_script = _ROOT_DIR / "tutorials" / "getting_started" / "scripts" / "run_checkpoint_conversion.sh" assert isfile(run_conversion_script), f"ERROR! {run_conversion_script} does not exist." 
command_conversion = f"cd {run_getting_started_example_directory}; " - command_conversion += f"sh run_checkpoint_conversion.sh {conversion_config_path} " + command_conversion += f"sh scripts/run_checkpoint_conversion.sh {conversion_config_path} " command_conversion += ( f"{run_getting_started_example_directory}/checkpoints/{modalities_checkpoint.split('/')[-1]}" ) - print(command_conversion) - subprocess.run(command_conversion, shell=True, capture_output=False, text=True) + subprocess_run(command_conversion) check_existence_and_clear_getting_started_example_output(run_getting_started_example_directory, date_of_run) @@ -237,8 +241,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d command_warmstart_example = ( f"cd {run_warmstart_example_directory}; sh pre_train_and_warmstart.sh {devices[0]} {devices[1]}" ) - print(command_warmstart_example) - subprocess.run(command_warmstart_example, shell=True, capture_output=False, text=True) + subprocess_run(command_warmstart_example) print("\n=== DONE ===") diff --git a/tutorials/getting_started/scripts/run_checkpoint_conversion.sh b/tutorials/getting_started/scripts/run_checkpoint_conversion.sh index 452a15d3a..43492f8bf 100644 --- a/tutorials/getting_started/scripts/run_checkpoint_conversion.sh +++ b/tutorials/getting_started/scripts/run_checkpoint_conversion.sh @@ -18,8 +18,8 @@ fi ### RUN ##### ############# echo "> run checkpoint conversion" -echo "python ../../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 -python ../../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 +echo "python ../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 +python ../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 echo "> test checkpoint conversion" # TODO From 01c75ecc140e887a44ebb2c096107f0de0e93207 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Mon, 24 Feb 2025 11:33:50 +0000 Subject: [PATCH 14/28] test(conversion): check checkpoint conversion based on getting started example --- src/modalities/conversion/gpt2/convert_gpt2.py | 2 +- .../getting_started/scripts/run_checkpoint_conversion.sh | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index 6f244a632..e6277760a 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -143,7 +143,7 @@ def test_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, n vocab_size (int): Vocabulary size of the model. (Required for generating random input tokens.) 
""" for _ in tqdm(range(num_testruns), desc="Testing converted model"): - input_ids = torch.randint(0, vocab_size, (1, 1024), device=hf_model.device) + input_ids = torch.randint(0, vocab_size, (1, modalities_model.sequence_length), device=hf_model.device) inputs = {modalities_model.sample_key: input_ids.to(modalities_model.transformer.wte.weight.device)} with torch.no_grad(): diff --git a/tutorials/getting_started/scripts/run_checkpoint_conversion.sh b/tutorials/getting_started/scripts/run_checkpoint_conversion.sh index 43492f8bf..61f0fd5b3 100644 --- a/tutorials/getting_started/scripts/run_checkpoint_conversion.sh +++ b/tutorials/getting_started/scripts/run_checkpoint_conversion.sh @@ -18,8 +18,5 @@ fi ### RUN ##### ############# echo "> run checkpoint conversion" -echo "python ../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 -python ../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 - -echo "> test checkpoint conversion" -# TODO +echo "python ../../src/modalities/conversion/gpt2/convert_gpt2.py" $1 $2 "--num_testruns 5" +python ../../src/modalities/conversion/gpt2/convert_gpt2.py $1 $2 --num_testruns 5 From b1197261b2461be491d71a6d4794407cec26acb2 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Mon, 24 Feb 2025 18:09:52 +0100 Subject: [PATCH 15/28] feat(huggingface): Added handling of additional config settings to conversion script. --- .../conversion/gpt2/convert_gpt2.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index e6277760a..b8cf92c30 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -42,7 +42,7 @@ def convert_gpt2( device_hf: str = "cpu", ) -> None: """Takes a modalities gpt2 model and converts it to a Huggingface transformers model. - The provided config yaml file should contain the model_raw section with the model configuration. + The provided config yaml file should contain the model_raw or model section with the model configuration. Additionally, the checkpointed_model section should be present and contain the path to the model checkpoint. Optionally, the function can run a number of test runs to compare the converted model with the original one. @@ -57,11 +57,11 @@ def convert_gpt2( hf_model, modalities_model = convert_model_checkpoint(modalities_config) if num_testruns > 0: - test_converted_model( + check_converted_model( hf_model.to(device_hf), modalities_model.to(device_modalities), num_testruns, - modalities_config["model_raw"]["config"]["vocab_size"], + modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"], ) hf_model.config.auto_map = { @@ -93,7 +93,7 @@ def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, def convert_model_config(modalities_config: dict) -> GPT2Config: """Converts the modalities model configuration to a Huggingface transformers configuration. - For this the model_raw section of the modalities config is used. + For this the model_raw or model section of the modalities config is used. Corresponding entries are mapped to the Huggingface configuration. Args: @@ -102,11 +102,22 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: Returns: GPT2Config: Converted Huggingface model configuration. 
""" - config = modalities_config["model_raw"]["config"] + config = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"] assert config["poe_type"] == PositionTypes.NOPE assert config["activation_type"] == "swiglu" + assert config["attention_norm"]["variant_key"] == "layer_norm" + assert config["ffn_norm"]["variant_key"] == "layer_norm" + assert config["lm_head_norm"]["variant_key"] == "layer_norm" + + if config["attention_implementation"] == "pytorch_flash": + attention_impl = "sdpa" + elif config["attention_implementation"] == "manual": + attention_impl = "eager" + else: + raise ValueError(f"Unknown or unsupported attention implementation {config['attention_implementation']}.") + return GPT2Config( vocab_size=config["vocab_size"], hidden_size=config["n_embd"], @@ -115,6 +126,7 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: num_key_value_heads=config["n_head_kv"], num_attention_heads=config["n_head_q"], intermediate_size=SwiGLU._get_hidden_dim(ffn_hidden=config["ffn_hidden"]), + attention_bias=config["bias"], mlp_bias=config["bias"], hidden_act="silu", layer_norm_eps=config["ffn_norm"]["config"]["eps"], @@ -128,12 +140,12 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: layer_norm_bias=config["ffn_norm"]["config"].get("bias", True), # TODO: see comment above max_position_embeddings=config["sequence_length"], rope_theta=config["attention_config"]["qkv_transforms"][0]["config"]["base_freq"], - _attn_implementation="sdpa", + _attn_implementation=attention_impl, output_attentions=False, ) -def test_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, num_testruns: int, vocab_size: int): +def check_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, num_testruns: int, vocab_size: int): """Tests the converted model by inputting a random token sequence and comparing the output logits of both models. Args: From 286b37f13dcdddbc18ca89867f2c4a7695cb778f Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Mon, 24 Feb 2025 18:12:43 +0100 Subject: [PATCH 16/28] test(huggingface): Added additional tests for conversion functions. TODO: test_converting_gpt2_does_not_change_outputs currently fails. 
--- tests/conversion/gpt2/test_convert_gpt2.py | 173 +++++++++++++++++- .../test_configs/gpt2_config_test.yaml | 62 +++++++ 2 files changed, 230 insertions(+), 5 deletions(-) create mode 100644 tests/conversion/test_configs/gpt2_config_test.yaml diff --git a/tests/conversion/gpt2/test_convert_gpt2.py b/tests/conversion/gpt2/test_convert_gpt2.py index 2062663b1..e8c538a2b 100644 --- a/tests/conversion/gpt2/test_convert_gpt2.py +++ b/tests/conversion/gpt2/test_convert_gpt2.py @@ -1,13 +1,49 @@ +import os +import shutil +from pathlib import Path + +import pytest import torch import torch.nn as nn -from modalities.conversion.gpt2.convert_gpt2 import _copy_weights_base_modules +from transformers import AutoModelForCausalLM + +from modalities.config.config import load_app_config_dict +from modalities.conversion.gpt2.convert_gpt2 import ( + _copy_weights_base_modules, + _transfer_model_code, + check_converted_model, + convert_gpt2, +) +from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM +from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block +from modalities.models.utils import ModelTypeEnum, get_model_from_config +from tests.conftest import _ROOT_DIR + + +def test_converting_gpt2_does_not_change_weights(tmp_path: Path, gpt2_config_path: str): + output_dir = tmp_path / "output" + convert_gpt2(gpt2_config_path, output_dir) + modalities_config = load_app_config_dict(gpt2_config_path) + original_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + converted_model = AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True) + check_same_weight_model(converted_model, original_model) + + +def test_converting_gpt2_does_not_change_outputs(tmp_path: Path, gpt2_config_path: str): + output_dir = tmp_path / "output" + convert_gpt2(gpt2_config_path, output_dir) + modalities_config = load_app_config_dict(gpt2_config_path) + original_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + converted_model = AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True) + vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] + check_converted_model(converted_model, original_model, 1, vocab_size) def test_copying_base_modules_weights_yields_identical_modules(): m1 = nn.Linear(10, 10, bias=True) m2 = nn.Linear(10, 10, bias=True) - m1.weight.data = torch.randn(10, 10) - m1.bias.data = torch.randn(10) + m2.weight.data = torch.randn(10, 10) + m2.bias.data = torch.randn(10) _copy_weights_base_modules(m1, m2) @@ -18,9 +54,136 @@ def test_copying_base_modules_weights_yields_identical_modules(): def test_copying_base_modules_works_when_bias_is_false(): m1 = nn.Linear(10, 10, bias=False) m2 = nn.Linear(10, 10, bias=False) - m1.weight.data = torch.randn(10, 10) + m2.weight.data = torch.randn(10, 10) _copy_weights_base_modules(m1, m2) assert torch.equal(m1.weight.data, m2.weight.data) - assert m1.bias == m2.bias == None + assert m1.bias == m2.bias and m2.bias is None + + +def test_copying_base_modules_fails_if_bias_settings_mismatch(): + m1 = nn.Linear(10, 10, bias=False) + m2 = nn.Linear(10, 10, bias=True) + m2.weight.data = torch.randn(10, 10) + m2.bias.data = torch.randn(10) + + with pytest.raises(AttributeError): + _copy_weights_base_modules(m1, m2) + + +def test_modeling_gpt2_gets_transferred_with_model_files(tmp_path: Path): + modeling_gpt2_path = tmp_path / 
"modeling_gpt2.py" + assert not modeling_gpt2_path.exists() + _transfer_model_code(tmp_path) + assert modeling_gpt2_path.exists() + + +def test_configuration_gpt2_gets_transferred_with_model_files(tmp_path: Path): + configuration_gpt2_path = tmp_path / "configuration_gpt2.py" + assert not configuration_gpt2_path.exists() + _transfer_model_code(tmp_path) + assert configuration_gpt2_path.exists() + + +def test_transferred_modeling_gpt2_does_not_import_from_modalities(tmp_path: Path): + _transfer_model_code(tmp_path) + with open(tmp_path / "modeling_gpt2.py") as f: + text = f.read() + assert "from modalities" not in text + assert "import modalities" not in text + + +def test_transferred_configuration_gpt2_does_not_import_from_modalities(tmp_path: Path): + _transfer_model_code(tmp_path) + with open(tmp_path / "configuration_gpt2.py") as f: + text = f.read() + assert "from modalities" not in text + assert "import modalities" not in text + + +@pytest.fixture() +def gpt2_config_path(tmp_path: Path, initialized_model: GPT2LLM, config_file_path: str) -> str: + new_config_filename = tmp_path / "gpt2_config_test.yaml" + model_path = tmp_path / "model.pth" + shutil.copy(config_file_path, new_config_filename) + torch.save(initialized_model.state_dict(), model_path) + with open(new_config_filename, "r") as file: + content = file.read() + content = content.replace("checkpoint_path: null", f"checkpoint_path: {model_path}") + with open(new_config_filename, "w") as file: + file.write(content) + return str(new_config_filename) + + +@pytest.fixture() +def initialized_model(set_env, config_dict: dict) -> GPT2LLM: + model = get_model_from_config(config=config_dict, model_type=ModelTypeEnum.MODEL) + assert isinstance(model, GPT2LLM) + return model + + +@pytest.fixture() +def set_env(): + os.environ["LOCAL_RANK"] = "0" + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + + +@pytest.fixture() +def config_dict(config_file_path: Path) -> dict: + return load_app_config_dict(config_file_path=config_file_path) + + +@pytest.fixture() +def config_file_path(config_file_name: str) -> Path: + config_file_path = _ROOT_DIR / Path("tests/conversion/test_configs/" + config_file_name) + return config_file_path + + +@pytest.fixture(params=["gpt2_config_test.yaml"]) +def config_file_name(request) -> str: + return request.param + + +@pytest.fixture +def device() -> str: + return "cpu" + + +def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): + converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) + assert torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) + for i, (llama_layer, modalities_layer) in enumerate( + zip(converted_model.model.layers, modalities_model.transformer.h) + ): + check_same_weight_attention(llama_layer, modalities_layer) + check_same_weight_mlp(llama_layer, modalities_layer) + check_same_weight_layer_norms(llama_layer, modalities_layer) + check_same_weight_base_modules(converted_model.lm_head, modalities_model.lm_head) + check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) + + +def check_same_weight_attention(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.self_attn.q_proj, modalities_layer.attn.q_attn) + check_same_weight_base_modules(llama_layer.self_attn.k_proj, modalities_layer.attn.k_attn) + check_same_weight_base_modules(llama_layer.self_attn.v_proj, modalities_layer.attn.v_attn) 
+ check_same_weight_base_modules(llama_layer.self_attn.o_proj, modalities_layer.attn.c_proj) + + +def check_same_weight_mlp(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.mlp.down_proj, modalities_layer.mlp.W_2) + check_same_weight_base_modules(llama_layer.mlp.gate_proj, modalities_layer.mlp.W) + check_same_weight_base_modules(llama_layer.mlp.up_proj, modalities_layer.mlp.V) + + +def check_same_weight_layer_norms(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.input_layernorm, modalities_layer.attention_norm) + check_same_weight_base_modules(llama_layer.post_attention_layernorm, modalities_layer.ffn_norm) + + +def check_same_weight_base_modules(l1: nn.Linear | nn.LayerNorm, l2: nn.Linear | nn.LayerNorm): + assert torch.equal(l1.weight, l2.weight) + assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) + assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) + assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) diff --git a/tests/conversion/test_configs/gpt2_config_test.yaml b/tests/conversion/test_configs/gpt2_config_test.yaml new file mode 100644 index 000000000..7d03fdb8c --- /dev/null +++ b/tests/conversion/test_configs/gpt2_config_test.yaml @@ -0,0 +1,62 @@ +model: + component_key: model + variant_key: gpt2 + config: + sample_key: input_ids + poe_type: NOPE + sequence_length: 256 + prediction_key: logits + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 6 + n_head_q: 12 + n_head_kv: 12 + ffn_hidden: 2048 + n_embd: 768 + dropout: 0.0 + bias: false # true # True: bias in Linears, like GPT-2. False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + base_freq: 500000 + attention_implementation: pytorch_flash # manual + activation_type: swiglu + attention_norm: + component_key: layer_norm + variant_key: layer_norm + config: + normalized_shape: ${model.config.n_embd} + eps: 1e-5 + # bias: true + ffn_norm: + component_key: layer_norm + variant_key: layer_norm + config: + normalized_shape: ${model.config.n_embd} + eps: 1e-5 + # bias: true + lm_head_norm: + component_key: layer_norm + variant_key: layer_norm + config: + normalized_shape: ${model.config.n_embd} + eps: 1e-5 + # bias: true + +checkpointed_model: + component_key: model + variant_key: checkpointed + config: + checkpoint_loading: + component_key: checkpoint_loading + variant_key: torch + config: + device: cpu + precision: BF16 + model: + instance_key: model + pass_type: BY_REFERENCE + checkpoint_path: null \ No newline at end of file From 39485753ea7c536529bcc4f8b3e703699be6b38f Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 25 Feb 2025 10:39:28 +0100 Subject: [PATCH 17/28] fix(checkpointing): Fixed warning. 
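Two things are addressed here: `torch.load` is called with an explicit `weights_only=False`, presumably to address the warning newer PyTorch versions emit when the argument is omitted, and a string-formatting bug in the logged precision warning is fixed. The latter is easy to miss; a minimal illustration of the bug, with a local stand-in for `self.precision.value`:

```python
# Minimal illustration of the formatting bug fixed in this commit: the second
# fragment of the implicitly concatenated string lacked the f-prefix, so its
# placeholder was printed literally instead of being interpolated.
loaded_precision = "torch.bfloat16"  # stand-in for self.precision.value

broken = (
    f"WARNING: Model checkpoint was stored with precision torch.float32 "
    "but is loaded with precision {self.precision.value}."  # no f-prefix -> literal braces
)
fixed = (
    f"WARNING: Model checkpoint was stored with precision torch.float32 "
    f"but is loaded with precision {loaded_precision}."
)
print(broken)
print(fixed)
```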
--- .../checkpointing/torch/torch_checkpoint_loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modalities/checkpointing/torch/torch_checkpoint_loading.py b/src/modalities/checkpointing/torch/torch_checkpoint_loading.py index c29cea27c..14d2800ac 100644 --- a/src/modalities/checkpointing/torch/torch_checkpoint_loading.py +++ b/src/modalities/checkpointing/torch/torch_checkpoint_loading.py @@ -45,13 +45,13 @@ def load_model_checkpoint(self, model: nn.Module, file_path: Path) -> nn.Module: else: model = model.to(self.device) - model_state = torch.load(file_path, map_location=self.device) + model_state = torch.load(file_path, map_location=self.device, weights_only=False) model_state_dtype = list(model_state.values())[0].dtype if self.precision is not None and self.precision.value != model_state_dtype: warning( f"WARNING: Model checkpoint was stored with precision {model_state_dtype} " - "but is loaded with precision {self.precision.value}." + f"but is loaded with precision {self.precision.value}." ) # assign=True makes sure that the model is loaded with the same precision From 8991797f096a40c98fa69b2139fdb0c00954a4a3 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 25 Feb 2025 10:47:50 +0100 Subject: [PATCH 18/28] test(huggingface): Fixed model output comparison test. --- tests/conversion/gpt2/test_convert_gpt2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/conversion/gpt2/test_convert_gpt2.py b/tests/conversion/gpt2/test_convert_gpt2.py index e8c538a2b..ff5f1a3cd 100644 --- a/tests/conversion/gpt2/test_convert_gpt2.py +++ b/tests/conversion/gpt2/test_convert_gpt2.py @@ -34,7 +34,9 @@ def test_converting_gpt2_does_not_change_outputs(tmp_path: Path, gpt2_config_pat convert_gpt2(gpt2_config_path, output_dir) modalities_config = load_app_config_dict(gpt2_config_path) original_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) - converted_model = AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True) + converted_model = AutoModelForCausalLM.from_pretrained( + output_dir, local_files_only=True, trust_remote_code=True + ).to(dtype=torch.bfloat16) vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] check_converted_model(converted_model, original_model, 1, vocab_size) From e91633d1baf3996977a3ef791f3e9194e5dcc86c Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 25 Feb 2025 12:37:07 +0100 Subject: [PATCH 19/28] refactor(huggingface): Split conversion logic into multiple files. 
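After this split, `convert_gpt2.py` only orchestrates the two new modules. A condensed sketch of the resulting flow is shown below; the wrapper name `convert_and_save` is only for illustration, and device handling, the `auto_map` registration and the CLI entry point are omitted:

```python
# Condensed sketch of the post-refactor flow: weight/config conversion lives in
# conversion_model.py, copying the standalone model code lives in
# conversion_code.py, and convert_gpt2.py ties them together.
from modalities.config.config import load_app_config_dict
from modalities.conversion.gpt2.conversion_code import transfer_model_code
from modalities.conversion.gpt2.conversion_model import check_converted_model, convert_model_checkpoint


def convert_and_save(modalities_config_path: str, output_dir: str, num_testruns: int = 0) -> None:
    modalities_config = load_app_config_dict(modalities_config_path)
    hf_model, modalities_model = convert_model_checkpoint(modalities_config)
    if num_testruns > 0:
        model_key = "model_raw" if "model_raw" in modalities_config else "model"
        vocab_size = modalities_config[model_key]["config"]["vocab_size"]
        check_converted_model(hf_model, modalities_model, num_testruns, vocab_size)
    hf_model.save_pretrained(output_dir)
    transfer_model_code(output_dir)
```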
--- .../conversion/gpt2/conversion_code.py | 32 ++++ .../conversion/gpt2/conversion_model.py | 145 ++++++++++++++ .../conversion/gpt2/convert_gpt2.py | 181 +----------------- tests/conversion/gpt2/conftest.py | 100 ++++++++++ tests/conversion/gpt2/test_conversion_code.py | 33 ++++ .../conversion/gpt2/test_conversion_model.py | 38 ++++ tests/conversion/gpt2/test_convert_gpt2.py | 124 +----------- 7 files changed, 354 insertions(+), 299 deletions(-) create mode 100644 src/modalities/conversion/gpt2/conversion_code.py create mode 100644 src/modalities/conversion/gpt2/conversion_model.py create mode 100644 tests/conversion/gpt2/conftest.py create mode 100644 tests/conversion/gpt2/test_conversion_code.py create mode 100644 tests/conversion/gpt2/test_conversion_model.py diff --git a/src/modalities/conversion/gpt2/conversion_code.py b/src/modalities/conversion/gpt2/conversion_code.py new file mode 100644 index 000000000..5f2b36c05 --- /dev/null +++ b/src/modalities/conversion/gpt2/conversion_code.py @@ -0,0 +1,32 @@ +import os +import shutil + + +def _copy_model_files(output_dir: str): + source_dir = os.path.dirname(__file__) + modeling_gpt2_path = os.path.join(source_dir, "modeling_gpt2.py") + configuration_gpt2_path = os.path.join(source_dir, "configuration_gpt2.py") + shutil.copy(modeling_gpt2_path, output_dir) + shutil.copy(configuration_gpt2_path, output_dir) + + +def _change_modalities_import_to_relative_import(output_dir: str): + target_modeling_file = os.path.join(output_dir, "modeling_gpt2.py") + with open(target_modeling_file, "r") as file: + content = file.read() + content = content.replace("modalities.conversion.gpt2.configuration_gpt2", ".configuration_gpt2") + with open(target_modeling_file, "w") as file: + file.write(content) + + +def transfer_model_code(output_dir: str): + """Copies the required model code to the output directory and replaces modalities imports. + This allows the converted model to be used without the modalities package via: + >>> from transformers import AutoModelForCausalLM + >>> model = AutoModelForCausalLM.from_pretrained("path/to/converted/model", trust_remote_code=True) + + Args: + output_dir (str): Directory of the converted model. + """ + _copy_model_files(output_dir) + _change_modalities_import_to_relative_import(output_dir) diff --git a/src/modalities/conversion/gpt2/conversion_model.py b/src/modalities/conversion/gpt2/conversion_model.py new file mode 100644 index 000000000..db60f1174 --- /dev/null +++ b/src/modalities/conversion/gpt2/conversion_model.py @@ -0,0 +1,145 @@ +import torch +import torch.nn as nn +from tqdm import tqdm + +from modalities.conversion.gpt2.configuration_gpt2 import GPT2Config +from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM +from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block, PositionTypes +from modalities.models.model import SwiGLU +from modalities.models.utils import ModelTypeEnum, get_model_from_config + + +def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, GPT2LLM]: + """Converts the modalities model to a Huggingface transformers model. + Both the loaded modalities model and the converted Huggingface model are returned + so that they can be compared. + + Args: + modalities_config (dict): Modalities config dictionary. + + Returns: + tuple[GPT2ForCausalLM, GPT2LLM]: Converted Hugging Face model and the original modalities model. 
+ """ + gpt2_config = convert_model_config(modalities_config) + hf_model = GPT2ForCausalLM(gpt2_config).to(dtype=torch.bfloat16) + modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + _copy_weights_model(hf_model, modalities_model) + return hf_model, modalities_model + + +def convert_model_config(modalities_config: dict) -> GPT2Config: + """Converts the modalities model configuration to a Huggingface transformers configuration. + For this the model_raw or model section of the modalities config is used. + Corresponding entries are mapped to the Huggingface configuration. + + Args: + modalities_config (dict): Modalities config dictionary. + + Returns: + GPT2Config: Converted Huggingface model configuration. + """ + config = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"] + + assert config["poe_type"] == PositionTypes.NOPE + assert config["activation_type"] == "swiglu" + + assert config["attention_norm"]["variant_key"] == "layer_norm" + assert config["ffn_norm"]["variant_key"] == "layer_norm" + assert config["lm_head_norm"]["variant_key"] == "layer_norm" + + if config["attention_implementation"] == "pytorch_flash": + attention_impl = "sdpa" + elif config["attention_implementation"] == "manual": + attention_impl = "eager" + else: + raise ValueError(f"Unknown or unsupported attention implementation {config['attention_implementation']}.") + + return GPT2Config( + vocab_size=config["vocab_size"], + hidden_size=config["n_embd"], + pad_token_id=None, + num_hidden_layers=config["n_layer"], + num_key_value_heads=config["n_head_kv"], + num_attention_heads=config["n_head_q"], + intermediate_size=SwiGLU._get_hidden_dim(ffn_hidden=config["ffn_hidden"]), + attention_bias=config["bias"], + mlp_bias=config["bias"], + hidden_act="silu", + layer_norm_eps=config["ffn_norm"]["config"]["eps"], + layer_norm_elementwise_affine=config["ffn_norm"]["config"].get( + "elementwise_affine", + True, + # TODO: + # Temporary solution: double-check that these are the correct default values. + # Permanent solution: read default values from where they are defined. + ), + layer_norm_bias=config["ffn_norm"]["config"].get("bias", True), # TODO: see comment above + max_position_embeddings=config["sequence_length"], + rope_theta=config["attention_config"]["qkv_transforms"][0]["config"]["base_freq"], + _attn_implementation=attention_impl, + output_attentions=False, + ) + + +def check_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, num_testruns: int, vocab_size: int): + """Tests the converted model by inputting a random token sequence and comparing the output logits of both models. + + Args: + hf_model (GPT2ForCausalLM): Huggingface transformers model. + modalities_model (GPT2LLM): Modalities model. + num_testruns (int): Number of test runs to perform. + vocab_size (int): Vocabulary size of the model. (Required for generating random input tokens.) 
+ """ + for _ in tqdm(range(num_testruns), desc="Testing converted model"): + input_ids = torch.randint(0, vocab_size, (1, modalities_model.sequence_length), device=hf_model.device) + inputs = {modalities_model.sample_key: input_ids.to(modalities_model.transformer.wte.weight.device)} + + with torch.no_grad(): + llama_logits = hf_model(input_ids=input_ids).logits.to("cpu") + modalities_logits = modalities_model(inputs)[modalities_model.prediction_key].to("cpu") + + assert llama_logits.shape == modalities_logits.shape + assert torch.equal(llama_logits, modalities_logits) + + +def _copy_weights_model(hf_model_model: GPT2ForCausalLM, modalities_model: GPT2LLM): + """Copies the weights of the modalities model to the Huggingface transformers model. + + Args: + hf_model_model (GPT2ForCausalLM): The uninitialized Huggingface transformers model. + The weights will be copied here. + modalities_model (GPT2LLM): The modalities model from which the weights will be copied. + """ + hf_model_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) + for hf_layer, modalities_layer in zip(hf_model_model.model.layers, modalities_model.transformer.h): + _copy_weights_attention(hf_layer, modalities_layer) + _copy_weights_mlp(hf_layer, modalities_layer) + _copy_weights_layer_norms(hf_layer, modalities_layer) + _copy_weights_base_modules(hf_model_model.lm_head, modalities_model.lm_head) + _copy_weights_base_modules(hf_model_model.model.norm, modalities_model.transformer.lm_head_norm) + + +def _copy_weights_attention(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + _copy_weights_base_modules(hf_layer.self_attn.q_proj, modalities_layer.attn.q_attn) + _copy_weights_base_modules(hf_layer.self_attn.k_proj, modalities_layer.attn.k_attn) + _copy_weights_base_modules(hf_layer.self_attn.v_proj, modalities_layer.attn.v_attn) + _copy_weights_base_modules(hf_layer.self_attn.o_proj, modalities_layer.attn.c_proj) + + +def _copy_weights_mlp(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + _copy_weights_base_modules(hf_layer.mlp.down_proj, modalities_layer.mlp.W_2) + _copy_weights_base_modules(hf_layer.mlp.gate_proj, modalities_layer.mlp.W) + _copy_weights_base_modules(hf_layer.mlp.up_proj, modalities_layer.mlp.V) + + +def _copy_weights_layer_norms(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + _copy_weights_base_modules(hf_layer.input_layernorm, modalities_layer.attention_norm) + _copy_weights_base_modules(hf_layer.post_attention_layernorm, modalities_layer.ffn_norm) + + +def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn.LayerNorm): + assert m1.weight.shape == m2.weight.shape + assert (m1.bias is None and m2.bias is None) or m1.bias.shape == m2.bias.shape + m1.weight.data.copy_(m2.weight.data) + if m1.bias is not None: + m1.bias.data.copy_(m2.bias.data) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index b8cf92c30..4d6027d8b 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -20,18 +20,10 @@ import argparse import os -import shutil - -import torch -import torch.nn as nn -from tqdm import tqdm from modalities.config.config import load_app_config_dict -from modalities.conversion.gpt2.configuration_gpt2 import GPT2Config -from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM -from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block, PositionTypes -from 
modalities.models.model import SwiGLU -from modalities.models.utils import ModelTypeEnum, get_model_from_config +from modalities.conversion.gpt2.conversion_code import transfer_model_code +from modalities.conversion.gpt2.conversion_model import check_converted_model, convert_model_checkpoint def convert_gpt2( @@ -45,6 +37,7 @@ def convert_gpt2( The provided config yaml file should contain the model_raw or model section with the model configuration. Additionally, the checkpointed_model section should be present and contain the path to the model checkpoint. Optionally, the function can run a number of test runs to compare the converted model with the original one. + If a tokenizer is specified in the config, it will be converted as well. Args: modalities_config_path (str): Path to the modalities config file. @@ -70,173 +63,7 @@ def convert_gpt2( "AutoModelForCausalLM": "modeling_gpt2.GPT2ForCausalLM", } hf_model.save_pretrained(output_dir) - _transfer_model_code(output_dir) - - -def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, GPT2LLM]: - """Converts the modalities model to a Huggingface transformers model. - Both the loaded modalities model and the converted Huggingface model are returned - so that they can be compared. - - Args: - modalities_config (dict): Modalities config dictionary. - - Returns: - tuple[GPT2ForCausalLM, GPT2LLM]: Converted Hugging Face model and the original modalities model. - """ - gpt2_config = convert_model_config(modalities_config) - hf_model = GPT2ForCausalLM(gpt2_config).to(dtype=torch.bfloat16) - modalities_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) - _copy_weights_model(hf_model, modalities_model) - return hf_model, modalities_model - - -def convert_model_config(modalities_config: dict) -> GPT2Config: - """Converts the modalities model configuration to a Huggingface transformers configuration. - For this the model_raw or model section of the modalities config is used. - Corresponding entries are mapped to the Huggingface configuration. - - Args: - modalities_config (dict): Modalities config dictionary. - - Returns: - GPT2Config: Converted Huggingface model configuration. - """ - config = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"] - - assert config["poe_type"] == PositionTypes.NOPE - assert config["activation_type"] == "swiglu" - - assert config["attention_norm"]["variant_key"] == "layer_norm" - assert config["ffn_norm"]["variant_key"] == "layer_norm" - assert config["lm_head_norm"]["variant_key"] == "layer_norm" - - if config["attention_implementation"] == "pytorch_flash": - attention_impl = "sdpa" - elif config["attention_implementation"] == "manual": - attention_impl = "eager" - else: - raise ValueError(f"Unknown or unsupported attention implementation {config['attention_implementation']}.") - - return GPT2Config( - vocab_size=config["vocab_size"], - hidden_size=config["n_embd"], - pad_token_id=None, - num_hidden_layers=config["n_layer"], - num_key_value_heads=config["n_head_kv"], - num_attention_heads=config["n_head_q"], - intermediate_size=SwiGLU._get_hidden_dim(ffn_hidden=config["ffn_hidden"]), - attention_bias=config["bias"], - mlp_bias=config["bias"], - hidden_act="silu", - layer_norm_eps=config["ffn_norm"]["config"]["eps"], - layer_norm_elementwise_affine=config["ffn_norm"]["config"].get( - "elementwise_affine", - True, - # TODO: - # Temporary solution: double-check that these are the correct default values. 
- # Permanent solution: read default values from where they are defined. - ), - layer_norm_bias=config["ffn_norm"]["config"].get("bias", True), # TODO: see comment above - max_position_embeddings=config["sequence_length"], - rope_theta=config["attention_config"]["qkv_transforms"][0]["config"]["base_freq"], - _attn_implementation=attention_impl, - output_attentions=False, - ) - - -def check_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, num_testruns: int, vocab_size: int): - """Tests the converted model by inputting a random token sequence and comparing the output logits of both models. - - Args: - hf_model (GPT2ForCausalLM): Huggingface transformers model. - modalities_model (GPT2LLM): Modalities model. - num_testruns (int): Number of test runs to perform. - vocab_size (int): Vocabulary size of the model. (Required for generating random input tokens.) - """ - for _ in tqdm(range(num_testruns), desc="Testing converted model"): - input_ids = torch.randint(0, vocab_size, (1, modalities_model.sequence_length), device=hf_model.device) - inputs = {modalities_model.sample_key: input_ids.to(modalities_model.transformer.wte.weight.device)} - - with torch.no_grad(): - llama_logits = hf_model(input_ids=input_ids).logits.to("cpu") - modalities_logits = modalities_model(inputs)[modalities_model.prediction_key].to("cpu") - - assert llama_logits.shape == modalities_logits.shape - assert torch.equal(llama_logits, modalities_logits) - - -def _copy_weights_model(hf_model_model: GPT2ForCausalLM, modalities_model: GPT2LLM): - """Copies the weights of the modalities model to the Huggingface transformers model. - - Args: - hf_model_model (GPT2ForCausalLM): The uninitialized Huggingface transformers model. - The weights will be copied here. - modalities_model (GPT2LLM): The modalities model from which the weights will be copied. 
- """ - hf_model_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) - for hf_layer, modalities_layer in zip(hf_model_model.model.layers, modalities_model.transformer.h): - _copy_weights_attention(hf_layer, modalities_layer) - _copy_weights_mlp(hf_layer, modalities_layer) - _copy_weights_layer_norms(hf_layer, modalities_layer) - _copy_weights_base_modules(hf_model_model.lm_head, modalities_model.lm_head) - _copy_weights_base_modules(hf_model_model.model.norm, modalities_model.transformer.lm_head_norm) - - -def _copy_weights_attention(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - _copy_weights_base_modules(hf_layer.self_attn.q_proj, modalities_layer.attn.q_attn) - _copy_weights_base_modules(hf_layer.self_attn.k_proj, modalities_layer.attn.k_attn) - _copy_weights_base_modules(hf_layer.self_attn.v_proj, modalities_layer.attn.v_attn) - _copy_weights_base_modules(hf_layer.self_attn.o_proj, modalities_layer.attn.c_proj) - - -def _copy_weights_mlp(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - _copy_weights_base_modules(hf_layer.mlp.down_proj, modalities_layer.mlp.W_2) - _copy_weights_base_modules(hf_layer.mlp.gate_proj, modalities_layer.mlp.W) - _copy_weights_base_modules(hf_layer.mlp.up_proj, modalities_layer.mlp.V) - - -def _copy_weights_layer_norms(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - _copy_weights_base_modules(hf_layer.input_layernorm, modalities_layer.attention_norm) - _copy_weights_base_modules(hf_layer.post_attention_layernorm, modalities_layer.ffn_norm) - - -def _copy_weights_base_modules(m1: nn.Linear | nn.LayerNorm, m2: nn.Linear | nn.LayerNorm): - assert m1.weight.shape == m2.weight.shape - assert (m1.bias is None and m2.bias is None) or m1.bias.shape == m2.bias.shape - m1.weight.data.copy_(m2.weight.data) - if m1.bias is not None: - m1.bias.data.copy_(m2.bias.data) - - -def _transfer_model_code(output_dir: str): - """Copies the required model code to the output directory and replaces modalities imports. - This allows the converted model to be used without the modalities package via: - >>> from transformers import AutoModelForCausalLM - >>> model = AutoModelForCausalLM.from_pretrained("path/to/converted/model", trust_remote_code=True) - - Args: - output_dir (str): Directory of the converted model. 
- """ - _copy_model_files(output_dir) - _change_modalities_import_to_relative_import(output_dir) - - -def _copy_model_files(output_dir: str): - source_dir = os.path.dirname(__file__) - modeling_gpt2_path = os.path.join(source_dir, "modeling_gpt2.py") - configuration_gpt2_path = os.path.join(source_dir, "configuration_gpt2.py") - shutil.copy(modeling_gpt2_path, output_dir) - shutil.copy(configuration_gpt2_path, output_dir) - - -def _change_modalities_import_to_relative_import(output_dir: str): - target_modeling_file = os.path.join(output_dir, "modeling_gpt2.py") - with open(target_modeling_file, "r") as file: - content = file.read() - content = content.replace("modalities.conversion.gpt2.configuration_gpt2", ".configuration_gpt2") - with open(target_modeling_file, "w") as file: - file.write(content) + transfer_model_code(output_dir) if __name__ == "__main__": diff --git a/tests/conversion/gpt2/conftest.py b/tests/conversion/gpt2/conftest.py new file mode 100644 index 000000000..dd7cad8d5 --- /dev/null +++ b/tests/conversion/gpt2/conftest.py @@ -0,0 +1,100 @@ +import os +import shutil +from pathlib import Path + +import pytest +import torch +import torch.nn as nn + +from modalities.config.config import load_app_config_dict +from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM +from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block +from modalities.models.utils import ModelTypeEnum, get_model_from_config +from tests.conftest import _ROOT_DIR + + +@pytest.fixture() +def gpt2_config_path(tmp_path: Path, initialized_model: GPT2LLM, config_file_path: str) -> str: + new_config_filename = tmp_path / "gpt2_config_test.yaml" + model_path = tmp_path / "model.pth" + shutil.copy(config_file_path, new_config_filename) + torch.save(initialized_model.state_dict(), model_path) + with open(new_config_filename, "r") as file: + content = file.read() + content = content.replace("checkpoint_path: null", f"checkpoint_path: {model_path}") + with open(new_config_filename, "w") as file: + file.write(content) + return str(new_config_filename) + + +@pytest.fixture() +def initialized_model(set_env, config_dict: dict) -> GPT2LLM: + model = get_model_from_config(config=config_dict, model_type=ModelTypeEnum.MODEL) + assert isinstance(model, GPT2LLM) + return model + + +@pytest.fixture() +def set_env(): + os.environ["LOCAL_RANK"] = "0" + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + + +@pytest.fixture() +def config_dict(config_file_path: Path) -> dict: + return load_app_config_dict(config_file_path=config_file_path) + + +@pytest.fixture() +def config_file_path(config_file_name: str) -> Path: + config_file_path = _ROOT_DIR / Path("tests/conversion/test_configs/" + config_file_name) + return config_file_path + + +@pytest.fixture(params=["gpt2_config_test.yaml"]) +def config_file_name(request) -> str: + return request.param + + +@pytest.fixture +def device() -> str: + return "cpu" + + +def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): + converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) + assert torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) + for i, (llama_layer, modalities_layer) in enumerate( + zip(converted_model.model.layers, modalities_model.transformer.h) + ): + check_same_weight_attention(llama_layer, modalities_layer) + check_same_weight_mlp(llama_layer, modalities_layer) + check_same_weight_layer_norms(llama_layer, modalities_layer) + 
check_same_weight_base_modules(converted_model.lm_head, modalities_model.lm_head) + check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) + + +def check_same_weight_attention(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.self_attn.q_proj, modalities_layer.attn.q_attn) + check_same_weight_base_modules(llama_layer.self_attn.k_proj, modalities_layer.attn.k_attn) + check_same_weight_base_modules(llama_layer.self_attn.v_proj, modalities_layer.attn.v_attn) + check_same_weight_base_modules(llama_layer.self_attn.o_proj, modalities_layer.attn.c_proj) + + +def check_same_weight_mlp(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.mlp.down_proj, modalities_layer.mlp.W_2) + check_same_weight_base_modules(llama_layer.mlp.gate_proj, modalities_layer.mlp.W) + check_same_weight_base_modules(llama_layer.mlp.up_proj, modalities_layer.mlp.V) + + +def check_same_weight_layer_norms(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.input_layernorm, modalities_layer.attention_norm) + check_same_weight_base_modules(llama_layer.post_attention_layernorm, modalities_layer.ffn_norm) + + +def check_same_weight_base_modules(l1: nn.Linear | nn.LayerNorm, l2: nn.Linear | nn.LayerNorm): + assert torch.equal(l1.weight, l2.weight) + assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) + assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) + assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) diff --git a/tests/conversion/gpt2/test_conversion_code.py b/tests/conversion/gpt2/test_conversion_code.py new file mode 100644 index 000000000..72d04b2e6 --- /dev/null +++ b/tests/conversion/gpt2/test_conversion_code.py @@ -0,0 +1,33 @@ +from pathlib import Path + +from modalities.conversion.gpt2.conversion_code import transfer_model_code + + +def test_modeling_gpt2_gets_transferred_with_model_files(tmp_path: Path): + modeling_gpt2_path = tmp_path / "modeling_gpt2.py" + assert not modeling_gpt2_path.exists() + transfer_model_code(tmp_path) + assert modeling_gpt2_path.exists() + + +def test_configuration_gpt2_gets_transferred_with_model_files(tmp_path: Path): + configuration_gpt2_path = tmp_path / "configuration_gpt2.py" + assert not configuration_gpt2_path.exists() + transfer_model_code(tmp_path) + assert configuration_gpt2_path.exists() + + +def test_transferred_modeling_gpt2_does_not_import_from_modalities(tmp_path: Path): + transfer_model_code(tmp_path) + with open(tmp_path / "modeling_gpt2.py") as f: + text = f.read() + assert "from modalities" not in text + assert "import modalities" not in text + + +def test_transferred_configuration_gpt2_does_not_import_from_modalities(tmp_path: Path): + transfer_model_code(tmp_path) + with open(tmp_path / "configuration_gpt2.py") as f: + text = f.read() + assert "from modalities" not in text + assert "import modalities" not in text diff --git a/tests/conversion/gpt2/test_conversion_model.py b/tests/conversion/gpt2/test_conversion_model.py new file mode 100644 index 000000000..60a9d743b --- /dev/null +++ b/tests/conversion/gpt2/test_conversion_model.py @@ -0,0 +1,38 @@ +import pytest +import torch +import torch.nn as nn + +from modalities.conversion.gpt2.conversion_model import _copy_weights_base_modules + + +def test_copying_base_modules_weights_yields_identical_modules(): + m1 = nn.Linear(10, 10, 
bias=True) + m2 = nn.Linear(10, 10, bias=True) + m2.weight.data = torch.randn(10, 10) + m2.bias.data = torch.randn(10) + + _copy_weights_base_modules(m1, m2) + + assert torch.equal(m1.weight.data, m2.weight.data) + assert torch.equal(m1.bias.data, m2.bias.data) + + +def test_copying_base_modules_works_when_bias_is_false(): + m1 = nn.Linear(10, 10, bias=False) + m2 = nn.Linear(10, 10, bias=False) + m2.weight.data = torch.randn(10, 10) + + _copy_weights_base_modules(m1, m2) + + assert torch.equal(m1.weight.data, m2.weight.data) + assert m1.bias == m2.bias and m2.bias is None + + +def test_copying_base_modules_fails_if_bias_settings_mismatch(): + m1 = nn.Linear(10, 10, bias=False) + m2 = nn.Linear(10, 10, bias=True) + m2.weight.data = torch.randn(10, 10) + m2.bias.data = torch.randn(10) + + with pytest.raises(AttributeError): + _copy_weights_base_modules(m1, m2) diff --git a/tests/conversion/gpt2/test_convert_gpt2.py b/tests/conversion/gpt2/test_convert_gpt2.py index ff5f1a3cd..0f3d203f2 100644 --- a/tests/conversion/gpt2/test_convert_gpt2.py +++ b/tests/conversion/gpt2/test_convert_gpt2.py @@ -1,23 +1,15 @@ -import os -import shutil from pathlib import Path -import pytest import torch import torch.nn as nn from transformers import AutoModelForCausalLM from modalities.config.config import load_app_config_dict -from modalities.conversion.gpt2.convert_gpt2 import ( - _copy_weights_base_modules, - _transfer_model_code, - check_converted_model, - convert_gpt2, -) +from modalities.conversion.gpt2.conversion_model import check_converted_model +from modalities.conversion.gpt2.convert_gpt2 import convert_gpt2 from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block from modalities.models.utils import ModelTypeEnum, get_model_from_config -from tests.conftest import _ROOT_DIR def test_converting_gpt2_does_not_change_weights(tmp_path: Path, gpt2_config_path: str): @@ -41,118 +33,6 @@ def test_converting_gpt2_does_not_change_outputs(tmp_path: Path, gpt2_config_pat check_converted_model(converted_model, original_model, 1, vocab_size) -def test_copying_base_modules_weights_yields_identical_modules(): - m1 = nn.Linear(10, 10, bias=True) - m2 = nn.Linear(10, 10, bias=True) - m2.weight.data = torch.randn(10, 10) - m2.bias.data = torch.randn(10) - - _copy_weights_base_modules(m1, m2) - - assert torch.equal(m1.weight.data, m2.weight.data) - assert torch.equal(m1.bias.data, m2.bias.data) - - -def test_copying_base_modules_works_when_bias_is_false(): - m1 = nn.Linear(10, 10, bias=False) - m2 = nn.Linear(10, 10, bias=False) - m2.weight.data = torch.randn(10, 10) - - _copy_weights_base_modules(m1, m2) - - assert torch.equal(m1.weight.data, m2.weight.data) - assert m1.bias == m2.bias and m2.bias is None - - -def test_copying_base_modules_fails_if_bias_settings_mismatch(): - m1 = nn.Linear(10, 10, bias=False) - m2 = nn.Linear(10, 10, bias=True) - m2.weight.data = torch.randn(10, 10) - m2.bias.data = torch.randn(10) - - with pytest.raises(AttributeError): - _copy_weights_base_modules(m1, m2) - - -def test_modeling_gpt2_gets_transferred_with_model_files(tmp_path: Path): - modeling_gpt2_path = tmp_path / "modeling_gpt2.py" - assert not modeling_gpt2_path.exists() - _transfer_model_code(tmp_path) - assert modeling_gpt2_path.exists() - - -def test_configuration_gpt2_gets_transferred_with_model_files(tmp_path: Path): - configuration_gpt2_path = tmp_path / "configuration_gpt2.py" - assert not configuration_gpt2_path.exists() - 
_transfer_model_code(tmp_path) - assert configuration_gpt2_path.exists() - - -def test_transferred_modeling_gpt2_does_not_import_from_modalities(tmp_path: Path): - _transfer_model_code(tmp_path) - with open(tmp_path / "modeling_gpt2.py") as f: - text = f.read() - assert "from modalities" not in text - assert "import modalities" not in text - - -def test_transferred_configuration_gpt2_does_not_import_from_modalities(tmp_path: Path): - _transfer_model_code(tmp_path) - with open(tmp_path / "configuration_gpt2.py") as f: - text = f.read() - assert "from modalities" not in text - assert "import modalities" not in text - - -@pytest.fixture() -def gpt2_config_path(tmp_path: Path, initialized_model: GPT2LLM, config_file_path: str) -> str: - new_config_filename = tmp_path / "gpt2_config_test.yaml" - model_path = tmp_path / "model.pth" - shutil.copy(config_file_path, new_config_filename) - torch.save(initialized_model.state_dict(), model_path) - with open(new_config_filename, "r") as file: - content = file.read() - content = content.replace("checkpoint_path: null", f"checkpoint_path: {model_path}") - with open(new_config_filename, "w") as file: - file.write(content) - return str(new_config_filename) - - -@pytest.fixture() -def initialized_model(set_env, config_dict: dict) -> GPT2LLM: - model = get_model_from_config(config=config_dict, model_type=ModelTypeEnum.MODEL) - assert isinstance(model, GPT2LLM) - return model - - -@pytest.fixture() -def set_env(): - os.environ["LOCAL_RANK"] = "0" - os.environ["RANK"] = "0" - os.environ["WORLD_SIZE"] = "1" - - -@pytest.fixture() -def config_dict(config_file_path: Path) -> dict: - return load_app_config_dict(config_file_path=config_file_path) - - -@pytest.fixture() -def config_file_path(config_file_name: str) -> Path: - config_file_path = _ROOT_DIR / Path("tests/conversion/test_configs/" + config_file_name) - return config_file_path - - -@pytest.fixture(params=["gpt2_config_test.yaml"]) -def config_file_name(request) -> str: - return request.param - - -@pytest.fixture -def device() -> str: - return "cpu" - - def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) assert torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) From d43b8d29b5c51322c126defbffd23b0979d3de99 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 25 Feb 2025 13:09:48 +0100 Subject: [PATCH 20/28] test(huggingface): Additional tests and some refactoring. 
--- tests/conversion/gpt2/conftest.py | 53 ++----------------- tests/conversion/gpt2/helper.py | 41 ++++++++++++++ .../conversion/gpt2/test_conversion_model.py | 27 ++++++++-- tests/conversion/gpt2/test_convert_gpt2.py | 42 +-------------- .../test_configs/gpt2_config_test.yaml | 4 +- 5 files changed, 70 insertions(+), 97 deletions(-) create mode 100644 tests/conversion/gpt2/helper.py diff --git a/tests/conversion/gpt2/conftest.py b/tests/conversion/gpt2/conftest.py index dd7cad8d5..3deaf3364 100644 --- a/tests/conversion/gpt2/conftest.py +++ b/tests/conversion/gpt2/conftest.py @@ -4,11 +4,9 @@ import pytest import torch -import torch.nn as nn from modalities.config.config import load_app_config_dict -from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM -from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block +from modalities.models.gpt2.gpt2_model import GPT2LLM from modalities.models.utils import ModelTypeEnum, get_model_from_config from tests.conftest import _ROOT_DIR @@ -28,8 +26,8 @@ def gpt2_config_path(tmp_path: Path, initialized_model: GPT2LLM, config_file_pat @pytest.fixture() -def initialized_model(set_env, config_dict: dict) -> GPT2LLM: - model = get_model_from_config(config=config_dict, model_type=ModelTypeEnum.MODEL) +def initialized_model(set_env, modalities_config_dict: dict) -> GPT2LLM: + model = get_model_from_config(config=modalities_config_dict, model_type=ModelTypeEnum.MODEL) assert isinstance(model, GPT2LLM) return model @@ -42,7 +40,7 @@ def set_env(): @pytest.fixture() -def config_dict(config_file_path: Path) -> dict: +def modalities_config_dict(config_file_path: Path) -> dict: return load_app_config_dict(config_file_path=config_file_path) @@ -55,46 +53,3 @@ def config_file_path(config_file_name: str) -> Path: @pytest.fixture(params=["gpt2_config_test.yaml"]) def config_file_name(request) -> str: return request.param - - -@pytest.fixture -def device() -> str: - return "cpu" - - -def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): - converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) - assert torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) - for i, (llama_layer, modalities_layer) in enumerate( - zip(converted_model.model.layers, modalities_model.transformer.h) - ): - check_same_weight_attention(llama_layer, modalities_layer) - check_same_weight_mlp(llama_layer, modalities_layer) - check_same_weight_layer_norms(llama_layer, modalities_layer) - check_same_weight_base_modules(converted_model.lm_head, modalities_model.lm_head) - check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) - - -def check_same_weight_attention(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - check_same_weight_base_modules(llama_layer.self_attn.q_proj, modalities_layer.attn.q_attn) - check_same_weight_base_modules(llama_layer.self_attn.k_proj, modalities_layer.attn.k_attn) - check_same_weight_base_modules(llama_layer.self_attn.v_proj, modalities_layer.attn.v_attn) - check_same_weight_base_modules(llama_layer.self_attn.o_proj, modalities_layer.attn.c_proj) - - -def check_same_weight_mlp(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - check_same_weight_base_modules(llama_layer.mlp.down_proj, modalities_layer.mlp.W_2) - check_same_weight_base_modules(llama_layer.mlp.gate_proj, modalities_layer.mlp.W) - check_same_weight_base_modules(llama_layer.mlp.up_proj, 
modalities_layer.mlp.V) - - -def check_same_weight_layer_norms(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - check_same_weight_base_modules(llama_layer.input_layernorm, modalities_layer.attention_norm) - check_same_weight_base_modules(llama_layer.post_attention_layernorm, modalities_layer.ffn_norm) - - -def check_same_weight_base_modules(l1: nn.Linear | nn.LayerNorm, l2: nn.Linear | nn.LayerNorm): - assert torch.equal(l1.weight, l2.weight) - assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) - assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) - assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) diff --git a/tests/conversion/gpt2/helper.py b/tests/conversion/gpt2/helper.py new file mode 100644 index 000000000..40eb75ba0 --- /dev/null +++ b/tests/conversion/gpt2/helper.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + +from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM +from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block + + +def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): + converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) + assert torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) + for i, (llama_layer, modalities_layer) in enumerate( + zip(converted_model.model.layers, modalities_model.transformer.h) + ): + check_same_weight_attention(llama_layer, modalities_layer) + check_same_weight_mlp(llama_layer, modalities_layer) + check_same_weight_layer_norms(llama_layer, modalities_layer) + check_same_weight_base_modules(converted_model.lm_head, modalities_model.lm_head) + check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) + + +def check_same_weight_attention(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.self_attn.q_proj, modalities_layer.attn.q_attn) + check_same_weight_base_modules(llama_layer.self_attn.k_proj, modalities_layer.attn.k_attn) + check_same_weight_base_modules(llama_layer.self_attn.v_proj, modalities_layer.attn.v_attn) + check_same_weight_base_modules(llama_layer.self_attn.o_proj, modalities_layer.attn.c_proj) + + +def check_same_weight_mlp(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.mlp.down_proj, modalities_layer.mlp.W_2) + check_same_weight_base_modules(llama_layer.mlp.gate_proj, modalities_layer.mlp.W) + check_same_weight_base_modules(llama_layer.mlp.up_proj, modalities_layer.mlp.V) + + +def check_same_weight_layer_norms(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): + check_same_weight_base_modules(llama_layer.input_layernorm, modalities_layer.attention_norm) + check_same_weight_base_modules(llama_layer.post_attention_layernorm, modalities_layer.ffn_norm) + + +def check_same_weight_base_modules(l1: nn.Linear | nn.LayerNorm, l2: nn.Linear | nn.LayerNorm): + assert torch.equal(l1.weight, l2.weight) + assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) diff --git a/tests/conversion/gpt2/test_conversion_model.py b/tests/conversion/gpt2/test_conversion_model.py index 60a9d743b..b75223dbf 100644 --- a/tests/conversion/gpt2/test_conversion_model.py +++ b/tests/conversion/gpt2/test_conversion_model.py @@ -2,7 +2,26 @@ import torch import torch.nn as nn -from 
modalities.conversion.gpt2.conversion_model import _copy_weights_base_modules +from modalities.config.config import load_app_config_dict +from modalities.conversion.gpt2.conversion_model import ( + _copy_weights_base_modules, + check_converted_model, + convert_model_checkpoint, +) +from tests.conversion.gpt2.helper import check_same_weight_base_modules, check_same_weight_model + + +def test_convert_model_checkpoint_does_not_change_weights(gpt2_config_path: str): + modalities_config = load_app_config_dict(gpt2_config_path) + hf_model, modalities_model = convert_model_checkpoint(modalities_config) + check_same_weight_model(hf_model, modalities_model) + + +def test_convert_model_checkpoint_produces_same_logits_as_original(gpt2_config_path: str): + modalities_config = load_app_config_dict(gpt2_config_path) + hf_model, modalities_model = convert_model_checkpoint(modalities_config) + vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] + check_converted_model(hf_model, modalities_model, num_testruns=1, vocab_size=vocab_size) def test_copying_base_modules_weights_yields_identical_modules(): @@ -13,8 +32,7 @@ def test_copying_base_modules_weights_yields_identical_modules(): _copy_weights_base_modules(m1, m2) - assert torch.equal(m1.weight.data, m2.weight.data) - assert torch.equal(m1.bias.data, m2.bias.data) + check_same_weight_base_modules(m1, m2) def test_copying_base_modules_works_when_bias_is_false(): @@ -24,8 +42,7 @@ def test_copying_base_modules_works_when_bias_is_false(): _copy_weights_base_modules(m1, m2) - assert torch.equal(m1.weight.data, m2.weight.data) - assert m1.bias == m2.bias and m2.bias is None + check_same_weight_base_modules(m1, m2) def test_copying_base_modules_fails_if_bias_settings_mismatch(): diff --git a/tests/conversion/gpt2/test_convert_gpt2.py b/tests/conversion/gpt2/test_convert_gpt2.py index 0f3d203f2..c7cbd573c 100644 --- a/tests/conversion/gpt2/test_convert_gpt2.py +++ b/tests/conversion/gpt2/test_convert_gpt2.py @@ -1,15 +1,13 @@ from pathlib import Path import torch -import torch.nn as nn from transformers import AutoModelForCausalLM from modalities.config.config import load_app_config_dict from modalities.conversion.gpt2.conversion_model import check_converted_model from modalities.conversion.gpt2.convert_gpt2 import convert_gpt2 -from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM -from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block from modalities.models.utils import ModelTypeEnum, get_model_from_config +from tests.conversion.gpt2.helper import check_same_weight_model def test_converting_gpt2_does_not_change_weights(tmp_path: Path, gpt2_config_path: str): @@ -31,41 +29,3 @@ def test_converting_gpt2_does_not_change_outputs(tmp_path: Path, gpt2_config_pat ).to(dtype=torch.bfloat16) vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] check_converted_model(converted_model, original_model, 1, vocab_size) - - -def check_same_weight_model(converted_model: GPT2ForCausalLM, modalities_model: GPT2LLM): - converted_model.to(device=modalities_model.transformer.h[0].attn.q_attn.weight.device) - assert torch.equal(converted_model.model.embed_tokens.weight, modalities_model.transformer.wte.weight) - for i, (llama_layer, modalities_layer) in enumerate( - zip(converted_model.model.layers, modalities_model.transformer.h) - ): - check_same_weight_attention(llama_layer, modalities_layer) - 
check_same_weight_mlp(llama_layer, modalities_layer) - check_same_weight_layer_norms(llama_layer, modalities_layer) - check_same_weight_base_modules(converted_model.lm_head, modalities_model.lm_head) - check_same_weight_base_modules(converted_model.model.norm, modalities_model.transformer.lm_head_norm) - - -def check_same_weight_attention(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - check_same_weight_base_modules(llama_layer.self_attn.q_proj, modalities_layer.attn.q_attn) - check_same_weight_base_modules(llama_layer.self_attn.k_proj, modalities_layer.attn.k_attn) - check_same_weight_base_modules(llama_layer.self_attn.v_proj, modalities_layer.attn.v_attn) - check_same_weight_base_modules(llama_layer.self_attn.o_proj, modalities_layer.attn.c_proj) - - -def check_same_weight_mlp(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - check_same_weight_base_modules(llama_layer.mlp.down_proj, modalities_layer.mlp.W_2) - check_same_weight_base_modules(llama_layer.mlp.gate_proj, modalities_layer.mlp.W) - check_same_weight_base_modules(llama_layer.mlp.up_proj, modalities_layer.mlp.V) - - -def check_same_weight_layer_norms(llama_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): - check_same_weight_base_modules(llama_layer.input_layernorm, modalities_layer.attention_norm) - check_same_weight_base_modules(llama_layer.post_attention_layernorm, modalities_layer.ffn_norm) - - -def check_same_weight_base_modules(l1: nn.Linear | nn.LayerNorm, l2: nn.Linear | nn.LayerNorm): - assert torch.equal(l1.weight, l2.weight) - assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) - assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) - assert (l1.bias is None and l2.bias is None) or torch.equal(l1.bias, l2.bias) diff --git a/tests/conversion/test_configs/gpt2_config_test.yaml b/tests/conversion/test_configs/gpt2_config_test.yaml index 7d03fdb8c..dd6d3deed 100644 --- a/tests/conversion/test_configs/gpt2_config_test.yaml +++ b/tests/conversion/test_configs/gpt2_config_test.yaml @@ -7,10 +7,10 @@ model: sequence_length: 256 prediction_key: logits vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency - n_layer: 6 + n_layer: 4 n_head_q: 12 n_head_kv: 12 - ffn_hidden: 2048 + ffn_hidden: 1024 n_embd: 768 dropout: 0.0 bias: false # true # True: bias in Linears, like GPT-2. False: a bit better and faster From 836e9343de835c6d43e6fb8ad3b752c2bf6ad0f3 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 25 Feb 2025 14:46:53 +0100 Subject: [PATCH 21/28] revert(checkpointing): For now, forcing weights_only is not supposed to be added. 
--- src/modalities/checkpointing/torch/torch_checkpoint_loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modalities/checkpointing/torch/torch_checkpoint_loading.py b/src/modalities/checkpointing/torch/torch_checkpoint_loading.py index 14d2800ac..2837c42a2 100644 --- a/src/modalities/checkpointing/torch/torch_checkpoint_loading.py +++ b/src/modalities/checkpointing/torch/torch_checkpoint_loading.py @@ -45,7 +45,7 @@ def load_model_checkpoint(self, model: nn.Module, file_path: Path) -> nn.Module: else: model = model.to(self.device) - model_state = torch.load(file_path, map_location=self.device, weights_only=False) + model_state = torch.load(file_path, map_location=self.device) model_state_dtype = list(model_state.values())[0].dtype if self.precision is not None and self.precision.value != model_state_dtype: From f4cc96223473432dcf6b80291ff198d090cf0223 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 25 Feb 2025 13:57:46 +0000 Subject: [PATCH 22/28] fix(conversion): check that criteria for conversion are fulfilled --- .../conversion/gpt2/conversion_model.py | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/modalities/conversion/gpt2/conversion_model.py b/src/modalities/conversion/gpt2/conversion_model.py index db60f1174..9069b41b1 100644 --- a/src/modalities/conversion/gpt2/conversion_model.py +++ b/src/modalities/conversion/gpt2/conversion_model.py @@ -27,6 +27,26 @@ def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, return hf_model, modalities_model +def _check_conversion_criteria(model_config: dict) -> None: + """Checks that the modalities config fulfills criteria necessary for conversion + + Args: + model_config (dict): model or model_raw part of the Modalities config dictionary. + + Returns: + None + """ + assert model_config["poe_type"] == PositionTypes.NOPE + assert model_config["bias"] is False + assert model_config["activation_type"] == "swiglu" + assert model_config["attention_implementation"] in ["pytorch_flash", "manual"] + + for norm in ["attention_norm", "ffn_norm", "lm_head_norm"]: + assert model_config[norm]["variant_key"] == "layer_norm" + assert model_config[norm]["config"].get("elementwise_affine", True) is True # True = default setting + assert model_config[norm]["config"].get("bias", True) is True # True = default setting + + def convert_model_config(modalities_config: dict) -> GPT2Config: """Converts the modalities model configuration to a Huggingface transformers configuration. For this the model_raw or model section of the modalities config is used. @@ -40,12 +60,7 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: """ config = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"] - assert config["poe_type"] == PositionTypes.NOPE - assert config["activation_type"] == "swiglu" - - assert config["attention_norm"]["variant_key"] == "layer_norm" - assert config["ffn_norm"]["variant_key"] == "layer_norm" - assert config["lm_head_norm"]["variant_key"] == "layer_norm" + _check_conversion_criteria(config) if config["attention_implementation"] == "pytorch_flash": attention_impl = "sdpa" @@ -69,11 +84,8 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: layer_norm_elementwise_affine=config["ffn_norm"]["config"].get( "elementwise_affine", True, - # TODO: - # Temporary solution: double-check that these are the correct default values. 
- # Permanent solution: read default values from where they are defined. ), - layer_norm_bias=config["ffn_norm"]["config"].get("bias", True), # TODO: see comment above + layer_norm_bias=config["ffn_norm"]["config"].get("bias", True), max_position_embeddings=config["sequence_length"], rope_theta=config["attention_config"]["qkv_transforms"][0]["config"]["base_freq"], _attn_implementation=attention_impl, From cd75707bb74c9c0ef829c81986c10e1dc5f62049 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 25 Feb 2025 15:39:33 +0100 Subject: [PATCH 23/28] docs(huggingface): Removed sentence in doc string that is not true yet. --- src/modalities/conversion/gpt2/convert_gpt2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index 4d6027d8b..693875a39 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -37,7 +37,6 @@ def convert_gpt2( The provided config yaml file should contain the model_raw or model section with the model configuration. Additionally, the checkpointed_model section should be present and contain the path to the model checkpoint. Optionally, the function can run a number of test runs to compare the converted model with the original one. - If a tokenizer is specified in the config, it will be converted as well. Args: modalities_config_path (str): Path to the modalities config file. From 75524d47f29f92371ddae692a71c55c2605de1ed Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 25 Feb 2025 15:55:59 +0100 Subject: [PATCH 24/28] test(huggingface): Added some variety in bias settings of test config for hf conversion. --- tests/conversion/test_configs/gpt2_config_test.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/conversion/test_configs/gpt2_config_test.yaml b/tests/conversion/test_configs/gpt2_config_test.yaml index dd6d3deed..56a5b9bfa 100644 --- a/tests/conversion/test_configs/gpt2_config_test.yaml +++ b/tests/conversion/test_configs/gpt2_config_test.yaml @@ -13,7 +13,7 @@ model: ffn_hidden: 1024 n_embd: 768 dropout: 0.0 - bias: false # true # True: bias in Linears, like GPT-2. False: a bit better and faster + bias: false # True: bias in Linears, like GPT-2. False: a bit better and faster attention_config: qkv_transforms: - type_hint: RotaryTransform @@ -30,21 +30,21 @@ model: config: normalized_shape: ${model.config.n_embd} eps: 1e-5 - # bias: true + bias: true ffn_norm: component_key: layer_norm variant_key: layer_norm config: normalized_shape: ${model.config.n_embd} eps: 1e-5 - # bias: true + bias: false lm_head_norm: component_key: layer_norm variant_key: layer_norm config: normalized_shape: ${model.config.n_embd} eps: 1e-5 - # bias: true + bias: true checkpointed_model: component_key: model From cf3cba651386d79ad59344adf33d33bef2983128 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 4 Mar 2025 14:15:00 +0100 Subject: [PATCH 25/28] fix(huggingface): The config conversion now only allows all norms to have the same setting. Having different settings is currently not supported by our hf model. Also, some additionally minor fixes and refactorings. 
--- .../conversion/gpt2/conversion_model.py | 93 +++++++++++-------- .../conversion/gpt2/convert_gpt2.py | 3 +- tests/conversion/gpt2/test_conversion_code.py | 8 +- tests/conversion/gpt2/test_convert_gpt2.py | 4 +- .../test_configs/gpt2_config_test.yaml | 14 +-- 5 files changed, 69 insertions(+), 53 deletions(-) diff --git a/src/modalities/conversion/gpt2/conversion_model.py b/src/modalities/conversion/gpt2/conversion_model.py index 9069b41b1..687a3dacf 100644 --- a/src/modalities/conversion/gpt2/conversion_model.py +++ b/src/modalities/conversion/gpt2/conversion_model.py @@ -4,6 +4,7 @@ from modalities.conversion.gpt2.configuration_gpt2 import GPT2Config from modalities.conversion.gpt2.modeling_gpt2 import GPT2DecoderLayer, GPT2ForCausalLM +from modalities.models.components.layer_norms import LayerNormConfig from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2Block, PositionTypes from modalities.models.model import SwiGLU from modalities.models.utils import ModelTypeEnum, get_model_from_config @@ -27,26 +28,6 @@ def convert_model_checkpoint(modalities_config: dict) -> tuple[GPT2ForCausalLM, return hf_model, modalities_model -def _check_conversion_criteria(model_config: dict) -> None: - """Checks that the modalities config fulfills criteria necessary for conversion - - Args: - model_config (dict): model or model_raw part of the Modalities config dictionary. - - Returns: - None - """ - assert model_config["poe_type"] == PositionTypes.NOPE - assert model_config["bias"] is False - assert model_config["activation_type"] == "swiglu" - assert model_config["attention_implementation"] in ["pytorch_flash", "manual"] - - for norm in ["attention_norm", "ffn_norm", "lm_head_norm"]: - assert model_config[norm]["variant_key"] == "layer_norm" - assert model_config[norm]["config"].get("elementwise_affine", True) is True # True = default setting - assert model_config[norm]["config"].get("bias", True) is True # True = default setting - - def convert_model_config(modalities_config: dict) -> GPT2Config: """Converts the modalities model configuration to a Huggingface transformers configuration. For this the model_raw or model section of the modalities config is used. @@ -59,16 +40,8 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: GPT2Config: Converted Huggingface model configuration. 
""" config = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"] - _check_conversion_criteria(config) - if config["attention_implementation"] == "pytorch_flash": - attention_impl = "sdpa" - elif config["attention_implementation"] == "manual": - attention_impl = "eager" - else: - raise ValueError(f"Unknown or unsupported attention implementation {config['attention_implementation']}.") - return GPT2Config( vocab_size=config["vocab_size"], hidden_size=config["n_embd"], @@ -80,15 +53,12 @@ def convert_model_config(modalities_config: dict) -> GPT2Config: attention_bias=config["bias"], mlp_bias=config["bias"], hidden_act="silu", - layer_norm_eps=config["ffn_norm"]["config"]["eps"], - layer_norm_elementwise_affine=config["ffn_norm"]["config"].get( - "elementwise_affine", - True, - ), - layer_norm_bias=config["ffn_norm"]["config"].get("bias", True), + layer_norm_eps=_get_layer_norm_value(config["ffn_norm"]["config"], "eps"), + layer_norm_elementwise_affine=_get_layer_norm_value(config["ffn_norm"]["config"], "elementwise_affine"), + layer_norm_bias=_get_layer_norm_value(config["ffn_norm"]["config"], "bias"), max_position_embeddings=config["sequence_length"], rope_theta=config["attention_config"]["qkv_transforms"][0]["config"]["base_freq"], - _attn_implementation=attention_impl, + _attn_implementation=_map_attention_type(config), output_attentions=False, ) @@ -114,7 +84,50 @@ def check_converted_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM, assert torch.equal(llama_logits, modalities_logits) -def _copy_weights_model(hf_model_model: GPT2ForCausalLM, modalities_model: GPT2LLM): +def _check_conversion_criteria(model_config: dict) -> None: + """Checks that the modalities config fulfills criteria necessary for conversion + + Args: + model_config (dict): model or model_raw part of the Modalities config dictionary. + + Returns: + None + """ + assert model_config["poe_type"] == PositionTypes.NOPE + assert model_config["activation_type"] == "swiglu" + assert model_config["attention_implementation"] in ["pytorch_flash", "manual"] + + norms = ["attention_norm", "ffn_norm", "lm_head_norm"] + for norm in norms: + assert model_config[norm]["variant_key"] == "layer_norm" + + assert ( + len(set(_get_layer_norm_value(model_config[norm]["config"], "bias") for norm in norms)) == 1 + ), "All norms must have the same bias setting." + assert ( + len(set(_get_layer_norm_value(model_config[norm]["config"], "elementwise_affine") for norm in norms)) == 1 + ), "All norms must have the same elementwise_affine setting." + assert ( + len(set(_get_layer_norm_value(model_config[norm]["config"], "eps") for norm in norms)) == 1 + ), "All norms must have the same eps setting." + + +def _get_layer_norm_value(config: dict, field: str) -> bool | float | int: + default = LayerNormConfig.model_fields[field].default + return config.get(field, default) + + +def _map_attention_type(config): + if config["attention_implementation"] == "pytorch_flash": + attention_impl = "sdpa" + elif config["attention_implementation"] == "manual": + attention_impl = "eager" + else: + raise ValueError(f"Unknown or unsupported attention implementation {config['attention_implementation']}.") + return attention_impl + + +def _copy_weights_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM): """Copies the weights of the modalities model to the Huggingface transformers model. 
Args: @@ -122,13 +135,13 @@ def _copy_weights_model(hf_model_model: GPT2ForCausalLM, modalities_model: GPT2L The weights will be copied here. modalities_model (GPT2LLM): The modalities model from which the weights will be copied. """ - hf_model_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) - for hf_layer, modalities_layer in zip(hf_model_model.model.layers, modalities_model.transformer.h): + hf_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) + for hf_layer, modalities_layer in zip(hf_model.model.layers, modalities_model.transformer.h): _copy_weights_attention(hf_layer, modalities_layer) _copy_weights_mlp(hf_layer, modalities_layer) _copy_weights_layer_norms(hf_layer, modalities_layer) - _copy_weights_base_modules(hf_model_model.lm_head, modalities_model.lm_head) - _copy_weights_base_modules(hf_model_model.model.norm, modalities_model.transformer.lm_head_norm) + _copy_weights_base_modules(hf_model.lm_head, modalities_model.lm_head) + _copy_weights_base_modules(hf_model.model.norm, modalities_model.transformer.lm_head_norm) def _copy_weights_attention(hf_layer: GPT2DecoderLayer, modalities_layer: GPT2Block): diff --git a/src/modalities/conversion/gpt2/convert_gpt2.py b/src/modalities/conversion/gpt2/convert_gpt2.py index 693875a39..bb2bc1d3b 100644 --- a/src/modalities/conversion/gpt2/convert_gpt2.py +++ b/src/modalities/conversion/gpt2/convert_gpt2.py @@ -20,6 +20,7 @@ import argparse import os +from pathlib import Path from modalities.config.config import load_app_config_dict from modalities.conversion.gpt2.conversion_code import transfer_model_code @@ -45,7 +46,7 @@ def convert_gpt2( device_modalities (str, optional): Device for the modalities model. Defaults to "cpu". device_hf (str, optional): Device for the Hugging Face model. Defaults to "cpu". 
""" - modalities_config = load_app_config_dict(modalities_config_path) + modalities_config = load_app_config_dict(Path(modalities_config_path)) hf_model, modalities_model = convert_model_checkpoint(modalities_config) if num_testruns > 0: diff --git a/tests/conversion/gpt2/test_conversion_code.py b/tests/conversion/gpt2/test_conversion_code.py index 72d04b2e6..e24dd74fd 100644 --- a/tests/conversion/gpt2/test_conversion_code.py +++ b/tests/conversion/gpt2/test_conversion_code.py @@ -6,19 +6,19 @@ def test_modeling_gpt2_gets_transferred_with_model_files(tmp_path: Path): modeling_gpt2_path = tmp_path / "modeling_gpt2.py" assert not modeling_gpt2_path.exists() - transfer_model_code(tmp_path) + transfer_model_code(str(tmp_path)) assert modeling_gpt2_path.exists() def test_configuration_gpt2_gets_transferred_with_model_files(tmp_path: Path): configuration_gpt2_path = tmp_path / "configuration_gpt2.py" assert not configuration_gpt2_path.exists() - transfer_model_code(tmp_path) + transfer_model_code(str(tmp_path)) assert configuration_gpt2_path.exists() def test_transferred_modeling_gpt2_does_not_import_from_modalities(tmp_path: Path): - transfer_model_code(tmp_path) + transfer_model_code(str(tmp_path)) with open(tmp_path / "modeling_gpt2.py") as f: text = f.read() assert "from modalities" not in text @@ -26,7 +26,7 @@ def test_transferred_modeling_gpt2_does_not_import_from_modalities(tmp_path: Pat def test_transferred_configuration_gpt2_does_not_import_from_modalities(tmp_path: Path): - transfer_model_code(tmp_path) + transfer_model_code(str(tmp_path)) with open(tmp_path / "configuration_gpt2.py") as f: text = f.read() assert "from modalities" not in text diff --git a/tests/conversion/gpt2/test_convert_gpt2.py b/tests/conversion/gpt2/test_convert_gpt2.py index c7cbd573c..97201da24 100644 --- a/tests/conversion/gpt2/test_convert_gpt2.py +++ b/tests/conversion/gpt2/test_convert_gpt2.py @@ -28,4 +28,6 @@ def test_converting_gpt2_does_not_change_outputs(tmp_path: Path, gpt2_config_pat output_dir, local_files_only=True, trust_remote_code=True ).to(dtype=torch.bfloat16) vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] - check_converted_model(converted_model, original_model, 1, vocab_size) + check_converted_model( + hf_model=converted_model, modalities_model=original_model, num_testruns=1, vocab_size=vocab_size + ) diff --git a/tests/conversion/test_configs/gpt2_config_test.yaml b/tests/conversion/test_configs/gpt2_config_test.yaml index 56a5b9bfa..ca98b980d 100644 --- a/tests/conversion/test_configs/gpt2_config_test.yaml +++ b/tests/conversion/test_configs/gpt2_config_test.yaml @@ -4,14 +4,14 @@ model: config: sample_key: input_ids poe_type: NOPE - sequence_length: 256 + sequence_length: 128 prediction_key: logits vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency - n_layer: 4 - n_head_q: 12 - n_head_kv: 12 - ffn_hidden: 1024 - n_embd: 768 + n_layer: 3 + n_head_q: 4 + n_head_kv: 4 + ffn_hidden: 512 + n_embd: 256 dropout: 0.0 bias: false # True: bias in Linears, like GPT-2. 
False: a bit better and faster attention_config: @@ -37,7 +37,7 @@ model: config: normalized_shape: ${model.config.n_embd} eps: 1e-5 - bias: false + bias: true lm_head_norm: component_key: layer_norm variant_key: layer_norm From 9707027d152aae714ad0fdb288dd78db580d507d Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Tue, 4 Mar 2025 14:19:00 +0100 Subject: [PATCH 26/28] docs(huggingface): small fix --- src/modalities/conversion/gpt2/conversion_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/modalities/conversion/gpt2/conversion_model.py b/src/modalities/conversion/gpt2/conversion_model.py index 687a3dacf..1ec229f60 100644 --- a/src/modalities/conversion/gpt2/conversion_model.py +++ b/src/modalities/conversion/gpt2/conversion_model.py @@ -131,8 +131,8 @@ def _copy_weights_model(hf_model: GPT2ForCausalLM, modalities_model: GPT2LLM): """Copies the weights of the modalities model to the Huggingface transformers model. Args: - hf_model_model (GPT2ForCausalLM): The uninitialized Huggingface transformers model. - The weights will be copied here. + hf_model (GPT2ForCausalLM): The uninitialized Huggingface transformers model. + The weights will be copied here. modalities_model (GPT2LLM): The modalities model from which the weights will be copied. """ hf_model.model.embed_tokens.weight.data.copy_(modalities_model.transformer.wte.weight.data) From 60bab8a9974bb8c6099457395cd4bb84532a1476 Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Wed, 5 Mar 2025 10:43:45 +0100 Subject: [PATCH 27/28] refactor(huggingface): missing type hint --- src/modalities/conversion/gpt2/conversion_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modalities/conversion/gpt2/conversion_model.py b/src/modalities/conversion/gpt2/conversion_model.py index 1ec229f60..67e045262 100644 --- a/src/modalities/conversion/gpt2/conversion_model.py +++ b/src/modalities/conversion/gpt2/conversion_model.py @@ -117,7 +117,7 @@ def _get_layer_norm_value(config: dict, field: str) -> bool | float | int: return config.get(field, default) -def _map_attention_type(config): +def _map_attention_type(config: dict): if config["attention_implementation"] == "pytorch_flash": attention_impl = "sdpa" elif config["attention_implementation"] == "manual": From 000a9fa6b2f573c57ef0aa9060377a81ba17fcda Mon Sep 17 00:00:00 2001 From: Timm Ruland Date: Thu, 6 Mar 2025 09:21:35 +0100 Subject: [PATCH 28/28] test(huggingface): Added one additional test and refactored some others. 
--- tests/conversion/gpt2/conftest.py | 5 +- .../conversion/gpt2/test_conversion_model.py | 6 +++ tests/conversion/gpt2/test_convert_gpt2.py | 52 +++++++++++++------ 3 files changed, 45 insertions(+), 18 deletions(-) diff --git a/tests/conversion/gpt2/conftest.py b/tests/conversion/gpt2/conftest.py index 3deaf3364..36280c92a 100644 --- a/tests/conversion/gpt2/conftest.py +++ b/tests/conversion/gpt2/conftest.py @@ -11,8 +11,9 @@ from tests.conftest import _ROOT_DIR -@pytest.fixture() -def gpt2_config_path(tmp_path: Path, initialized_model: GPT2LLM, config_file_path: str) -> str: +@pytest.fixture +def gpt2_config_path(tmpdir_factory: pytest.TempdirFactory, initialized_model: GPT2LLM, config_file_path: str) -> str: + tmp_path = tmpdir_factory.mktemp("gpt2_model") new_config_filename = tmp_path / "gpt2_config_test.yaml" model_path = tmp_path / "model.pth" shutil.copy(config_file_path, new_config_filename) diff --git a/tests/conversion/gpt2/test_conversion_model.py b/tests/conversion/gpt2/test_conversion_model.py index b75223dbf..b75adaba0 100644 --- a/tests/conversion/gpt2/test_conversion_model.py +++ b/tests/conversion/gpt2/test_conversion_model.py @@ -11,6 +11,12 @@ from tests.conversion.gpt2.helper import check_same_weight_base_modules, check_same_weight_model +def test_convert_model_can_generate(gpt2_config_path: str): + modalities_config = load_app_config_dict(gpt2_config_path) + hf_model, _ = convert_model_checkpoint(modalities_config) + assert hf_model.can_generate() + + def test_convert_model_checkpoint_does_not_change_weights(gpt2_config_path: str): modalities_config = load_app_config_dict(gpt2_config_path) hf_model, modalities_model = convert_model_checkpoint(modalities_config) diff --git a/tests/conversion/gpt2/test_convert_gpt2.py b/tests/conversion/gpt2/test_convert_gpt2.py index 97201da24..9764c8027 100644 --- a/tests/conversion/gpt2/test_convert_gpt2.py +++ b/tests/conversion/gpt2/test_convert_gpt2.py @@ -1,33 +1,53 @@ from pathlib import Path +import pytest import torch -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, PreTrainedModel from modalities.config.config import load_app_config_dict from modalities.conversion.gpt2.conversion_model import check_converted_model from modalities.conversion.gpt2.convert_gpt2 import convert_gpt2 +from modalities.models.gpt2.gpt2_model import GPT2LLM from modalities.models.utils import ModelTypeEnum, get_model_from_config from tests.conversion.gpt2.helper import check_same_weight_model -def test_converting_gpt2_does_not_change_weights(tmp_path: Path, gpt2_config_path: str): - output_dir = tmp_path / "output" - convert_gpt2(gpt2_config_path, output_dir) - modalities_config = load_app_config_dict(gpt2_config_path) - original_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) - converted_model = AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True) +def test_converting_gpt2_does_not_change_weights(converted_model: PreTrainedModel, original_model: GPT2LLM): check_same_weight_model(converted_model, original_model) -def test_converting_gpt2_does_not_change_outputs(tmp_path: Path, gpt2_config_path: str): - output_dir = tmp_path / "output" - convert_gpt2(gpt2_config_path, output_dir) - modalities_config = load_app_config_dict(gpt2_config_path) - original_model = get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) - converted_model = AutoModelForCausalLM.from_pretrained( - output_dir, 
local_files_only=True, trust_remote_code=True - ).to(dtype=torch.bfloat16) - vocab_size = modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] +def test_converting_gpt2_does_not_change_outputs( + converted_model: PreTrainedModel, original_model: GPT2LLM, vocab_size: int +): check_converted_model( hf_model=converted_model, modalities_model=original_model, num_testruns=1, vocab_size=vocab_size ) + + +@pytest.fixture +def converted_model(run_convert_gpt2: None, output_dir: Path) -> PreTrainedModel: + return AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True).to( + dtype=torch.bfloat16 + ) + + +@pytest.fixture +def run_convert_gpt2(gpt2_config_path: str, output_dir: Path): + convert_gpt2(gpt2_config_path, output_dir) + + +@pytest.fixture +def original_model(gpt2_config_path: str) -> GPT2LLM: + modalities_config = load_app_config_dict(gpt2_config_path) + return get_model_from_config(modalities_config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + + +@pytest.fixture +def vocab_size(gpt2_config_path: str) -> int: + modalities_config = load_app_config_dict(gpt2_config_path) + return modalities_config["model_raw" if "model_raw" in modalities_config else "model"]["config"]["vocab_size"] + + +@pytest.fixture +def output_dir(tmp_path: Path) -> Path: + return tmp_path / "output"
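
For reference, here is a minimal usage sketch of the conversion flow that this patch series introduces. It is an illustration only: the paths are placeholders, and the config YAML is assumed to contain a model (or model_raw) section plus a checkpointed_model section whose checkpoint_path points at the trained modalities checkpoint, as described in the convert_gpt2 docstring above.

# Sketch only; "configs/gpt2_config.yaml" and "converted_gpt2" are placeholder paths.
from transformers import AutoModelForCausalLM

from modalities.conversion.gpt2.convert_gpt2 import convert_gpt2

config_path = "configs/gpt2_config.yaml"  # modalities config with model/model_raw and checkpointed_model sections
output_dir = "converted_gpt2"             # directory that will hold the converted Hugging Face checkpoint

# Convert the modalities GPT-2 checkpoint into the Llama-style Hugging Face model.
convert_gpt2(config_path, output_dir)

# Because transfer_model_code() copies modeling_gpt2.py and configuration_gpt2.py into
# output_dir and rewrites the modalities imports to relative ones, the converted model
# loads without the modalities package being installed.
model = AutoModelForCausalLM.from_pretrained(output_dir, local_files_only=True, trust_remote_code=True)

The load call mirrors the one used in tests/conversion/gpt2/test_convert_gpt2.py; trust_remote_code=True is needed because the model class ships as custom code next to the weights.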