From 55cfa1b35a693d578f33bcc08298dece4ad5cb83 Mon Sep 17 00:00:00 2001
From: leon-seidel <83984854+leon-seidel@users.noreply.github.com>
Date: Mon, 27 Jan 2025 18:34:24 +0100
Subject: [PATCH 01/15] Add Idefics3/SmolVLM quant support via traceable class
(#1095)
SUMMARY:
Adding a traceable Idefics3 class following the new
[guide](https://github.com/vllm-project/llm-compressor/blob/main/src/llmcompressor/transformers/tracing/GUIDE.md)
to allow W4A16 quants of Idefics3 and SmolVLM (which share the same
architecture). Idefics3 seems to require a max_sequence_length of 4096.
I based the example on the Phi 3 Vision example, since the dataset loading
approach from the Llava example led to OOM on 64 GB of RAM.
TEST PLAN:
Tested on A100 with Idefics3 @512 samples and on a 4060 Ti with SmolVLM
@128 samples.
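For reference, a minimal sketch of loading the compressed output for inference
(assuming a vLLM build with Idefics3/SmolVLM support; the directory name is
hypothetical and follows the SAVE_DIR convention used in the example script):
```python3
from vllm import LLM, SamplingParams

# Hypothetical local path produced by the example script below.
llm = LLM(model="./SmolVLM-Instruct-W4A16-G128", trust_remote_code=True)
outputs = llm.generate("Describe a cat.", SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```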
---------
Co-authored-by: Kyle Sayers
---
.../multimodal_vision/idefics3_example.py | 117 +++++
.../transformers/tracing/__init__.py | 4 +
.../transformers/tracing/idefics3.py | 424 ++++++++++++++++++
3 files changed, 545 insertions(+)
create mode 100644 examples/multimodal_vision/idefics3_example.py
create mode 100644 src/llmcompressor/transformers/tracing/idefics3.py
diff --git a/examples/multimodal_vision/idefics3_example.py b/examples/multimodal_vision/idefics3_example.py
new file mode 100644
index 000000000..2a3934d15
--- /dev/null
+++ b/examples/multimodal_vision/idefics3_example.py
@@ -0,0 +1,117 @@
+import requests
+import torch
+from datasets import load_dataset
+from PIL import Image
+from transformers import AutoProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import TraceableIdefics3ForConditionalGeneration
+
+# Load model.
+model_id = "HuggingFaceM4/Idefics3-8B-Llama3" # or "HuggingFaceTB/SmolVLM-Instruct"
+model = TraceableIdefics3ForConditionalGeneration.from_pretrained(
+ model_id, device_map="auto", torch_dtype="auto"
+)
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+# Oneshot arguments
+DATASET_ID = "lmms-lab/flickr30k"
+DATASET_SPLIT = "test[:512]"
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 4096 # Seems to be required here
+
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+ assert len(batch) == 1
+ return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
+# Recipe
+recipe = [
+ GPTQModifier(
+ targets="Linear",
+ scheme="W4A16",
+ sequential_targets=["LlamaDecoderLayer"],
+ ignore=["re:.*lm_head", "re:model.vision_model.*", "re:model.connector.*"],
+ ),
+]
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
+ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))
+
+
+# Apply chat template
+def preprocess(example):
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What does the image show?"},
+ {"type": "image"},
+ ],
+ }
+ ]
+ return {
+ "text": processor.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ ),
+ "images": example["image"],
+ }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+ return processor(
+ text=sample["text"],
+ images=sample["images"],
+ padding=False,
+ max_length=MAX_SEQUENCE_LENGTH,
+ truncation=True,
+ )
+
+
+# avoid errors with writer_batch_size
+ds = ds.map(tokenize, writer_batch_size=1, remove_columns=ds.column_names)
+
+# Perform oneshot
+oneshot(
+ model=model,
+ dataset=ds,
+ recipe=recipe,
+ max_seq_length=MAX_SEQUENCE_LENGTH,
+ num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+ trust_remote_code_model=True,
+ data_collator=data_collator,
+)
+
+# Confirm generations of the quantized model look sane.
+print("========== SAMPLE GENERATION ==============")
+messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Please describe the animal in this image\n"},
+ {"type": "image"},
+ ],
+ },
+]
+prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
+raw_image = Image.open(requests.get(image_url, stream=True).raw)
+
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
+print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py
index fae57dbb1..d8df42c93 100644
--- a/src/llmcompressor/transformers/tracing/__init__.py
+++ b/src/llmcompressor/transformers/tracing/__init__.py
@@ -7,9 +7,13 @@
from .qwen2_vl import (
Qwen2VLForConditionalGeneration as TraceableQwen2VLForConditionalGeneration,
)
+from .idefics3 import (
+ Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration
+)
__all__ = [
"TraceableLlavaForConditionalGeneration",
"TraceableMllamaForConditionalGeneration",
"TraceableQwen2VLForConditionalGeneration",
+ "TraceableIdefics3ForConditionalGeneration"
]
diff --git a/src/llmcompressor/transformers/tracing/idefics3.py b/src/llmcompressor/transformers/tracing/idefics3.py
new file mode 100644
index 000000000..8c61ba45e
--- /dev/null
+++ b/src/llmcompressor/transformers/tracing/idefics3.py
@@ -0,0 +1,424 @@
+# flake8: noqa
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# vllm-project: no copyright
+"""PyTorch Idefics3 model."""
+
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.utils import logging
+# from transformers.models.auto import AutoModel
+from transformers.models.idefics3.configuration_idefics3 import Idefics3Config
+from transformers.models.idefics3.modeling_idefics3 import (
+ Idefics3Model,
+ Idefics3ForConditionalGeneration,
+ Idefics3VisionTransformer,
+ Idefics3Connector,
+ Idefics3BaseModelOutputWithPast
+)
+from transformers.models.llama.modeling_llama import (
+ LlamaModel,
+ LlamaAttention,
+ LlamaDecoderLayer,
+ LlamaConfig,
+ LlamaRMSNorm,
+ LlamaMLP,
+ LlamaRotaryEmbedding,
+ apply_rotary_pos_emb,
+ eager_attention_forward
+ )
+
+from typing import Callable
+from transformers.processing_utils import Unpack
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.cache_utils import StaticCache
+
+logger = logging.get_logger(__name__)
+
+
+# TRACING: cannot condition on mask shape
+@torch.fx.wrap
+def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask: torch.Tensor,
+ sequence_length: int,
+ target_length: int,
+ dtype: torch.dtype,
+ device: torch.device,
+ cache_position: torch.Tensor,
+ batch_size: int,
+ **kwargs,
+):
+ """
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+ Args:
+ attention_mask (`torch.Tensor`):
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+ `(batch_size, 1, query_length, key_value_length)`.
+ sequence_length (`int`):
+ The sequence length being processed.
+ target_length (`int`):
+ The target length: when generating with static cache, the mask should be as long as the static cache,
+ to account for the 0 padding, the part of the cache that is not filled yet.
+ dtype (`torch.dtype`):
+ The dtype to use for the 4D attention mask.
+ device (`torch.device`):
+ The device to place the 4D attention mask on.
+ cache_position (`torch.Tensor`):
+ Indices depicting the position of the input sequence tokens in the sequence.
+ batch_size (`torch.Tensor`):
+ Batch size.
+ """
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = torch.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+ )
+ if sequence_length != 1:
+ causal_mask = torch.triu(causal_mask, diagonal=1)
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+ if attention_mask is not None:
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+ padding_mask, min_dtype
+ )
+
+ return causal_mask
+
+
+class LlamaAttention(LlamaAttention):
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+ attention_mask: Optional[torch.Tensor],
+ past_key_value: Optional[Cache] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ **kwargs: Unpack[FlashAttentionKwargs],
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ input_shape = hidden_states.shape[:-1]
+ # TRACING: Use input_shape[0], input_shape[1] instead of *input_shape
+ hidden_shape = (input_shape[0], input_shape[1], -1, self.head_dim) #(*input_shape, -1, self.head_dim)
+
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+ cos, sin = position_embeddings
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+ if past_key_value is not None:
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
+ logger.warning_once(
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ else:
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ attn_output, attn_weights = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ scaling=self.scaling,
+ **kwargs,
+ )
+
+ attn_output = attn_output.reshape(input_shape[0], input_shape[1], -1).contiguous()
+ attn_output = self.o_proj(attn_output)
+ return attn_output, attn_weights
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+ def __init__(self, config: LlamaConfig, layer_idx: int):
+ super().__init__(config, layer_idx)
+ self.hidden_size = config.hidden_size
+ # TRACING: Use custom LlamaAttention
+ self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx)
+
+ self.mlp = LlamaMLP(config)
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+
+class LlamaModel(LlamaModel):
+ def __init__(self, config: LlamaConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ # TRACING: Use custom LlamaDecoderLayer
+ self.layers = nn.ModuleList(
+ [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+ )
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.rotary_emb = LlamaRotaryEmbedding(config=config)
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def _update_causal_mask(
+ self,
+ attention_mask: torch.Tensor,
+ input_tensor: torch.Tensor,
+ cache_position: torch.Tensor,
+ past_key_values: Cache,
+ output_attentions: bool,
+ ):
+ if self.config._attn_implementation == "flash_attention_2":
+ if attention_mask is not None and (attention_mask == 0.0).any():
+ return attention_mask
+ return None
+
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+ # to infer the attention mask.
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ using_static_cache = isinstance(past_key_values, StaticCache)
+
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
+ attention_mask,
+ inputs_embeds=input_tensor,
+ past_key_values_length=past_seen_tokens,
+ is_training=self.training,
+ ):
+ return None
+
+ dtype, device = input_tensor.dtype, input_tensor.device
+ sequence_length = input_tensor.shape[1]
+ if using_static_cache:
+ target_length = past_key_values.get_max_cache_shape()
+ else:
+ target_length = (
+ attention_mask.shape[-1]
+ if isinstance(attention_mask, torch.Tensor)
+ else past_seen_tokens + sequence_length + 1
+ )
+
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+ # TRACING: Use wrapped _prepare_4d_causal_attention_mask_with_cache_position
+ causal_mask = _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=sequence_length,
+ target_length=target_length,
+ dtype=dtype,
+ device=device,
+ cache_position=cache_position,
+ batch_size=input_tensor.shape[0],
+ )
+
+ if (
+ self.config._attn_implementation == "sdpa"
+ and attention_mask is not None
+ and attention_mask.device.type == "cuda"
+ and not output_attentions
+ ):
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+ # Details: https://github.com/pytorch/pytorch/issues/110213
+ min_dtype = torch.finfo(dtype).min
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+ return causal_mask
+
+
+class Idefics3Model(Idefics3Model):
+ def __init__(self, config: Idefics3Config):
+ super().__init__(config)
+ self.padding_idx = self.config.text_config.pad_token_id
+ self.vocab_size = self.config.text_config.vocab_size
+
+ self.vision_model = Idefics3VisionTransformer._from_config(config.vision_config)
+ self.connector = Idefics3Connector(config)
+ # TRACING: Use traceable LlamaModel
+ self.text_model = LlamaModel(config.text_config) # AutoModel.from_config(config.text_config)
+
+ self.image_seq_len = int(
+ ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
+ )
+ self.image_token_id = self.config.image_token_id
+
+ self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2"
+
+ self.post_init()
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ pixel_attention_mask: Optional[torch.BoolTensor] = None,
+ image_hidden_states: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Idefics3BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.training and self.text_model.gradient_checkpointing and use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ past_seen_tokens = 0
+ if use_cache:
+ if past_key_values is None:
+ past_key_values = DynamicCache()
+ past_seen_tokens = past_key_values.get_seq_length()
+
+ if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
+ raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(self.device)
+
+ # START VISUAL INPUTS INTEGRATION
+ if pixel_values is not None and image_hidden_states is not None:
+ raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
+ elif pixel_values is not None:
+ batch_size, num_images, num_channels, height, width = pixel_values.shape
+ pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility
+ # TRACING: Use pixel_values.shape[2], pixel_values.shape[3], pixel_values.shape[4] instead of *pixel_values.shape[2:]
+ pixel_values = pixel_values.view(batch_size * num_images, pixel_values.shape[2], pixel_values.shape[3], pixel_values.shape[4])
+
+ # Remove padding images - padding images are full 0.
+ nb_values_per_image = pixel_values.shape[1:].numel()
+ real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+ pixel_values = pixel_values[real_images_inds].contiguous()
+
+ # Handle the vision attention mask
+ if pixel_attention_mask is None:
+ pixel_attention_mask = torch.ones(
+ size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)),
+ dtype=torch.bool,
+ device=pixel_values.device,
+ )
+ else:
+ # Remove padding images from the mask
+ # TRACING: Use pixel_attention_mask.shape[2], pixel_attention_mask.shape[3] instead of *pixel_attention_mask.shape[2:]
+ pixel_attention_mask = pixel_attention_mask.view(
+ batch_size * num_images, pixel_attention_mask.shape[2], pixel_attention_mask.shape[3]
+ )
+ pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
+
+ patch_size = self.config.vision_config.patch_size
+ patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+ patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+ patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
+ # Get sequence from the vision encoder
+ image_hidden_states = self.vision_model(
+ pixel_values=pixel_values,
+ patch_attention_mask=patch_attention_mask,
+ ).last_hidden_state
+
+ # Modality projection & resampling
+ image_hidden_states = self.connector(image_hidden_states)
+
+ elif image_hidden_states is not None:
+ image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
+
+ if past_seen_tokens == 0 and inputs_embeds is not None and image_hidden_states is not None:
+ # When we generate, we don't want to replace the potential image_token_id that we generated by images
+ # that simply don't exist
+ inputs_embeds = self.inputs_merger(
+ input_ids=input_ids,
+ inputs_embeds=inputs_embeds,
+ image_hidden_states=image_hidden_states,
+ )
+
+ outputs = self.text_model(
+ inputs_embeds=inputs_embeds,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ return tuple(v for v in [*outputs, image_hidden_states] if v is not None)
+
+ return Idefics3BaseModelOutputWithPast(
+ last_hidden_state=outputs.last_hidden_state,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ image_hidden_states=image_hidden_states,
+ )
+
+
+class Idefics3ForConditionalGeneration(Idefics3ForConditionalGeneration):
+ # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.__init__ with Idefics2->Idefics3
+ def __init__(self, config):
+ super().__init__(config)
+ # TRACING: Use custom Idefics3Model
+ self.model = Idefics3Model(config)
+ self.image_token_id = self.config.image_token_id
+
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+ self.vocab_size = config.text_config.vocab_size
+
+ # Initialize weights and apply final processing
+ self.post_init()
From a9b8654f05514a256f057cdceb31f3fa7ef69ec0 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 27 Jan 2025 13:55:52 -0500
Subject: [PATCH 02/15] Traceability Guide: Clarity and typo (#1099)
## Purpose ##
* Reword some things and fix typo in tracing guide
## Changes ##
* No need to wrap numbers with code font
* Activations are used to calibrate the hessian, not recorded into the
hessian
* Add close bracket for emphasis tag; the missing bracket was causing the rest
of the doc to be italicized
Signed-off-by: Kyle Sayers
Co-authored-by: Dipika Sikka
---
src/llmcompressor/transformers/tracing/GUIDE.md | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/llmcompressor/transformers/tracing/GUIDE.md b/src/llmcompressor/transformers/tracing/GUIDE.md
index 193bfd7b7..887881342 100644
--- a/src/llmcompressor/transformers/tracing/GUIDE.md
+++ b/src/llmcompressor/transformers/tracing/GUIDE.md
@@ -16,14 +16,14 @@ a [Sequential Pipeline](/src/llmcompressor/pipelines/sequential/pipeline.py)
is required in order to offload activations and reduce memory usage as well as propagate
the activation error induced by compression.
-For example, let's say we want to quantize a basic `3` layer model using the
-[GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py) and `512`
+For example, let's say we want to quantize a basic 3 layer model using the
+[GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py) and 512
calibration samples. The [Sequential Pipeline](/src/llmcompressor/pipelines/sequential/pipeline.py)
first identifies each of the layers (sequential targets) within the model. Then, the
-pipeline runs each of the `512` samples, one sample at a time, through the first layer.
+pipeline runs each of the 512 samples, one sample at a time, through the first layer.
When one sample completes its forward pass through the layer, its activations are
-recorded by the [GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py)
-hessian and the layer output is offloaded to the cpu. After all `512` samples have been
+used by the [GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py)
+to calibrate the hessian and the layer output is offloaded to the cpu. After all 512 samples have been
passed through the layer, the [GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py)
uses the recorded activations to compress the weights of the modules within the layer.
Once module compression is complete, the offloaded activations are used to perform the
@@ -242,7 +242,7 @@ def _prepare_cross_attention_mask(...) -> ...:
-<em>This image depicts how the internals of the _prepare_cross_attention_mask function are replaced by a single call_module operation, similar to how modules can be ignored as featured in section 1
+<em>This image depicts how the internals of the _prepare_cross_attention_mask function are replaced by a single call_module operation, similar to how modules can be ignored as featured in section 1</em>
Please note that wrapped functions must be defined at the module-level, meaning that
From 6fa5a5eecc7d363ec73474d011d40135b6374179 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Mon, 27 Jan 2025 16:54:22 -0500
Subject: [PATCH 03/15] [VLM] Examples README (#1057)
## Purpose ##
* Create a landing page for those looking to use VLMs
* Advertise VLM support on homepage
## Prerequisites ##
* #1030
---------
Signed-off-by: Kyle Sayers
Co-authored-by: Brian Dellabetta
---
README.md | 1 +
examples/multimodal_vision/README.md | 64 ++++++++++++++++++++++++++++
2 files changed, 65 insertions(+)
create mode 100644 examples/multimodal_vision/README.md
diff --git a/README.md b/README.md
index fd7f2f3e3..9ba3caae3 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ Applying quantization with `llmcompressor`:
* [Activation quantization to `fp8`](examples/quantization_w8a8_fp8)
* [Weight only quantization to `int4`](examples/quantization_w4a16)
* [Quantizing MoE LLMs](examples/quantizing_moe)
+* [Quantizing Multimodal VLMs](examples/multimodal_vision)
### User Guides
Deep dives into advanced usage of `llmcompressor`:
diff --git a/examples/multimodal_vision/README.md b/examples/multimodal_vision/README.md
new file mode 100644
index 000000000..69f31ffb0
--- /dev/null
+++ b/examples/multimodal_vision/README.md
@@ -0,0 +1,64 @@
+# Quantizing Multimodal Vision-Language Models #
+
+
+
+
+
+
+```
+<|system|>
+You are a helpful assistant.
+
+<|user|>
+Please describe the animal in this image
+
+<|assistant|>
+The animal in the image is a white kitten.
+It has a fluffy coat and is resting on a white keyboard.
+The kitten appears to be comfortable and relaxed, possibly enjoying the warmth of the keyboard.
+```
+
+
+This directory contains example scripts for quantizing a variety of vision-language models using GPTQ quantization. Most examples do not demonstrate quantizing separate vision encoder parameters, if they exist, as compressing these parameters offers little benefit with respect to the performance-accuracy tradeoff.
+
+## Compressing Your Own Model ##
+To use your own multimodal model, start with an existing example and change the `model_id` to match your own model stub.
+```python3
+model_id = "path/to/your/model"
+model = AutoModelForCausalLM.from_pretrained(
+ model_id,
+ device_map="auto",
+ torch_dtype="auto",
+)
+```
+
+## Customizing GPTQModifier Parameters ##
+The GPTQModifier is the modifier responsible for performing quantization of the model weights. For more information on quantizing with different weight schemes, see the `quantization_` examples in the [examples folder](/examples/).
+
+```python3
+recipe = [
+ GPTQModifier(
+ targets="Linear",
+ scheme="W4A16",
+ sequential_targets=["MistralDecoderLayer"],
+ ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+ ),
+]
+```
+
+### Sequential Targets ###
+Sequential targets are the modules which determine the granularity of error propagation and activation offloading when performing forward passes of the model. These are typically the "transformer blocks" of the model, also referred to as "layers" within llm-compressor.
+
+Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
+
+### Ignore ###
+If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoiding the untraceable operations.
+
+## Tracing Errors ##
+Because the architectures of vision-language models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
+
+## Adding Your Own Smoothquant Mappings ##
+For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
+
+## Adding Your Own Data Collator ##
+Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
\ No newline at end of file
From b61092b8e398cb6b5a7137b787a5d58e573a1d11 Mon Sep 17 00:00:00 2001
From: Rahul Tuli
Date: Tue, 28 Jan 2025 14:13:02 -0600
Subject: [PATCH 04/15] Raise warning for 2:4 compressed sparse-only models
(#1107)
In a recent update, we disabled Cutlass kernels for sparse-only models
(https://github.com/vllm-project/vllm/pull/12417). As a result,
2:4 sparse-only compressed models are no longer runnable in vLLM.
This PR introduces a warning message to inform users when compression is
enabled in scenarios where sparse-only models are unsupported. This
ensures clarity and avoids unexpected behavior when using sparse 2:4
configurations with vLLM.
Changes:
- Added a warning to notify users when attempting to enable compression
with sparse-only models in unsupported configurations.
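A minimal sketch of the workaround suggested by the new warning (the `model`
variable and save directory are hypothetical; `disable_sparse_compression` is
the flag referenced in the warning message):
```python3
# Save a 2:4 sparse-only (non-quantized) model without sparse compression so
# that the checkpoint stays loadable by vLLM versions lacking sparse-only
# Cutlass support.
model.save_pretrained("my-2of4-sparse-model", disable_sparse_compression=True)
```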
---------
Signed-off-by: Rahul Tuli
---
.../transformers/compression/sparsity_config.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/llmcompressor/transformers/compression/sparsity_config.py b/src/llmcompressor/transformers/compression/sparsity_config.py
index eb4b5f18c..e048ac838 100644
--- a/src/llmcompressor/transformers/compression/sparsity_config.py
+++ b/src/llmcompressor/transformers/compression/sparsity_config.py
@@ -181,7 +181,11 @@ def is_sparse24_bitmask_supported(
return False
if not is_model_quantized(model):
- # non-quantized 2:4 sparse models are supported
+ logger.warning(
+ "Compressed Sparse-only 2:4 models are not supported in vLLM<=0.7.0, "
+ "consider saving with `disable_sparse_compression` set, "
+ "`model.save_pretrained(..., disable_sparse_compression=True)`"
+ )
return True
# when model is quantized, and has 2:4 sparsity
From 8764291119709f7585028cf87f0500b6b8107d74 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 10:33:48 -0500
Subject: [PATCH 05/15] Remove log_model_load (#1016)
## Purpose ##
* Remove unused code that is unlikely to be used in the future, since
we're now using default transformers autoclasses to load quantized
models
## Changes ##
* Remove `log_model_load`, since we now prefer to load as run_compressed
## Testing ##
`grep -r 'log_model_load' src/ tests/ examples/`
Signed-off-by: Kyle Sayers
---
.../pytorch/model_load/helpers.py | 41 -------------------
1 file changed, 41 deletions(-)
diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py
index a9ecb67a7..5ddc7ebd5 100644
--- a/src/llmcompressor/pytorch/model_load/helpers.py
+++ b/src/llmcompressor/pytorch/model_load/helpers.py
@@ -8,13 +8,11 @@
from torch.nn import Module
from llmcompressor.core import active_session, create_session, pre_initialize_structure
-from llmcompressor.pytorch.utils import ModuleSparsificationInfo
from llmcompressor.typing import Processor
COMPLETED_STAGES_FILENAME = "completed_stages.json"
__all__ = [
- "log_model_load",
"initialize_recipe",
"save_model_and_recipe",
"copy_python_files_from_model_cache",
@@ -26,45 +24,6 @@
]
-def log_model_load(
- model: Module, model_name_or_path: str, model_type: str, delayed_load: bool
-):
- """
- Log the state of a loaded model including sparsity and
- prunable params information.
-
- :param model: the loaded model
- :param model_name_or_path: the original name of or path to the model that loaded
- :param model_type: specify the type of model loaded for logging;
- ex one of [model, student, teacher]
- :param delayed_load: True if this model load was delayed until after
- recipe instantiation due to QAT or other architectural state changes
- """
- if delayed_load:
- logger.info(
- f"Delayed load of model {model_name_or_path} detected. "
- f"Will print out model information once LLMCompressor recipes have loaded"
- )
- return
-
- sparsification_info = ModuleSparsificationInfo(model)
-
- logger.info(
- f"Loaded {model_type} from {model_name_or_path} "
- f"with {sparsification_info.params_total} total params. "
- f"Of those there are {sparsification_info.params_prunable_total} prunable "
- f"params which have {sparsification_info.params_prunable_sparse_percent} "
- "avg sparsity."
- )
- model_type = (
- "sparse" if sparsification_info.params_prunable_sparse_percent > 5 else "dense"
- )
- logger.info(
- f"{model_type} model detected, "
- f"all sparsification info: {sparsification_info}"
- )
-
-
def initialize_recipe(model: Module, recipe_path: str):
"""
Initializes a recipe that has been previously applied to the model
From 8de50a21dc7044a5cb1e38a2029f6e8cb605b0d1 Mon Sep 17 00:00:00 2001
From: Rahul Tuli
Date: Wed, 29 Jan 2025 14:46:25 -0600
Subject: [PATCH 06/15] Return empty sparsity config if targets and ignores are
empty (#1115)
This PR fixes an issue where a sparsity configuration could end up being
empty under certain conditions. Specifically, if the global sparsity is
greater than 0.05, but no individual layer has a sparsity greater than
0.5, we end up with an empty sparsity config.
To address this, we now ensure that an empty sparsity config is not
added in such cases.
---
- To see the specific tasks where the Asana app for GitHub is being
used, see below:
- https://app.asana.com/0/0/1209272443107638
Signed-off-by: Rahul Tuli
Co-authored-by: Dipika Sikka
---
.../transformers/compression/sparsity_config.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/llmcompressor/transformers/compression/sparsity_config.py b/src/llmcompressor/transformers/compression/sparsity_config.py
index e048ac838..d35ddadd1 100644
--- a/src/llmcompressor/transformers/compression/sparsity_config.py
+++ b/src/llmcompressor/transformers/compression/sparsity_config.py
@@ -130,6 +130,11 @@ def from_pretrained(
sparsity_threshold=SparsityConfigMetadata.SPARSITY_THRESHOLD,
)
+ if not (targets or ignores):
+ # no sparsity config
+ # needed if targets/ignores are empty
+ return None
+
return SparsityCompressionConfig.load_from_registry(
format,
global_sparsity=global_sparsity,
From e32c8dc1c9ee20023b57eeb00a5aa5274160bdf4 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:08:41 -0500
Subject: [PATCH 07/15] Remove uses of get_observer (#939)
## Purpose ##
* Be consistent about how the observer of the quantization arguments is
referenced
From compressed tensors:
```python3
def get_observer(self):
return self.observer
```
## Postrequisites ##
* https://github.com/neuralmagic/compressed-tensors/pull/214
## Changes ##
* Remove all uses of `quantization_args.get_observer()`
## Testing ##
```
grep -r '\.get_observer()' src examples tests
```
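The replacement pattern, sketched for clarity (import paths are taken from the
test files below and assumed correct):
```python3
from compressed_tensors.quantization import QuantizationArgs
from llmcompressor.observers import Observer

args = QuantizationArgs(num_bits=8, symmetric=True)
# Previously: observer_name = args.get_observer()
observer_name = args.observer
observer = Observer.load_from_registry(observer_name, quantization_args=args)
```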
Signed-off-by: Kyle Sayers
---
src/llmcompressor/modifiers/quantization/cache.py | 4 ++--
src/llmcompressor/modifiers/quantization/calibration.py | 2 +-
tests/llmcompressor/modifiers/calibration/test_cache.py | 2 +-
tests/llmcompressor/observers/test_min_max.py | 8 ++++----
tests/llmcompressor/observers/test_mse.py | 4 ++--
5 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py
index 6277e9643..5b2be2c65 100644
--- a/src/llmcompressor/modifiers/quantization/cache.py
+++ b/src/llmcompressor/modifiers/quantization/cache.py
@@ -78,11 +78,11 @@ def update(
"""
if len(self.k_observers) <= layer_idx:
- k_observer_name = self.quantization_args.get_observer()
+ k_observer_name = self.quantization_args.observer
k_observer = Observer.load_from_registry(
k_observer_name, quantization_args=self.quantization_args
)
- v_observer_name = self.quantization_args.get_observer()
+ v_observer_name = self.quantization_args.observer
v_observer = Observer.load_from_registry(
v_observer_name, quantization_args=self.quantization_args
)
diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py
index ee4ce171e..300507644 100644
--- a/src/llmcompressor/modifiers/quantization/calibration.py
+++ b/src/llmcompressor/modifiers/quantization/calibration.py
@@ -52,7 +52,7 @@ def initialize_observer(
quantization_args = getattr(quantization_scheme, arg_name, None)
# dont need observers for dynamic
if quantization_args and not quantization_args.dynamic:
- observer = quantization_args.get_observer()
+ observer = quantization_args.observer
observer = Observer.load_from_registry(
observer, quantization_args=quantization_args
)
diff --git a/tests/llmcompressor/modifiers/calibration/test_cache.py b/tests/llmcompressor/modifiers/calibration/test_cache.py
index 6ea024037..898c342f5 100644
--- a/tests/llmcompressor/modifiers/calibration/test_cache.py
+++ b/tests/llmcompressor/modifiers/calibration/test_cache.py
@@ -28,7 +28,7 @@ def test_is_quantized_cache_singleton():
args = QuantizationArgs()
cache = QuantizedKVParameterCache(args)
- observer = args.get_observer()
+ observer = args.observer
observer = Observer.load_from_registry(observer, quantization_args=args)
tensor = torch.tensor([1, 2, 3])
diff --git a/tests/llmcompressor/observers/test_min_max.py b/tests/llmcompressor/observers/test_min_max.py
index f23a06dba..b592579f6 100644
--- a/tests/llmcompressor/observers/test_min_max.py
+++ b/tests/llmcompressor/observers/test_min_max.py
@@ -37,7 +37,7 @@ def test_min_max_observer(symmetric, expected_scale, expected_zero_point):
num_bits = 8
weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric)
- observer = weights.get_observer()
+ observer = weights.observer
observer = Observer.load_from_registry(observer, quantization_args=weights)
scale, zero_point = observer(tensor)
@@ -52,7 +52,7 @@ def test_min_max_observer_symmetric_scale_range():
num_bits = 8
weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
- observer = weights.get_observer()
+ observer = weights.observer
observer = Observer.load_from_registry(observer, quantization_args=weights)
scale, zero_point = observer(tensor)
@@ -80,7 +80,7 @@ def test_min_max_observer_value_update():
tensor = inp
num_bits = 8
weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
- observer = weights.get_observer()
+ observer = weights.observer
observer = Observer.load_from_registry(observer, quantization_args=weights)
curr_max = 1
curr_min = 1
@@ -107,7 +107,7 @@ def test_g_idx():
weights = QuantizationArgs(num_bits=8, group_size=group_size)
g_idx = make_dummy_g_idx(tensor.shape[1], group_size)
- observer = weights.get_observer()
+ observer = weights.observer
observer = Observer.load_from_registry(observer, quantization_args=weights)
scale_g_idx, zero_point_g_idx = observer(tensor, g_idx=g_idx)
diff --git a/tests/llmcompressor/observers/test_mse.py b/tests/llmcompressor/observers/test_mse.py
index ec2ecf1b5..4447813b3 100644
--- a/tests/llmcompressor/observers/test_mse.py
+++ b/tests/llmcompressor/observers/test_mse.py
@@ -32,7 +32,7 @@ def test_mse_observer(symmetric, expected_scale, expected_zero_point):
num_bits = 8
weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric, observer="mse")
- observer = weights.get_observer()
+ observer = weights.observer
observer = Observer.load_from_registry(observer, quantization_args=weights)
scale, zero_point = observer(tensor)
@@ -48,7 +48,7 @@ def test_mse_observer_symmetric_scale_range():
num_bits = 8
weights = QuantizationArgs(num_bits=num_bits, symmetric=True)
- observer = weights.get_observer()
+ observer = weights.observer
observer = Observer.load_from_registry(observer, quantization_args=weights)
scale, zero_point = observer(tensor)
From 75a3551e1b41734c59ea0e5bbcdffa664982c558 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:09:07 -0500
Subject: [PATCH 08/15] FSDP utils cleanup (#854)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Purpose ##
* Code cleanup
* Fix bug where FSDP and accelerate imports are coupled, meaning that
not having one will cause the other's utils to fail
## Changes ##
* Decouple `accelerate` and `fsdp` imports
* Use existing `FSDP_WRAPPED_MODULE` constant
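A quick illustration of the renamed constant (a sketch, assuming torch's FSDP
is importable; `FSDP_WRAPPED_MODULE` equals `"_fsdp_wrapped_module"`, the same
value as the removed `FSDP_WRAPPER_NAME`):
```python3
from llmcompressor.utils.fsdp.context import fix_fsdp_module_name

# Wrapper prefixes are stripped whether they appear at the start of the name
# or in the middle of it; behavior is unchanged by this patch.
assert fix_fsdp_module_name("_fsdp_wrapped_module.model.layers.0") == "model.layers.0"
assert fix_fsdp_module_name("model._fsdp_wrapped_module.layers.0") == "model.layers.0"
```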
---------
Signed-off-by: Kyle Sayers
Signed-off-by: andy-neuma
Signed-off-by: Rahul Tuli
Signed-off-by: Domenic Barbuzzi
Signed-off-by: Dipika
Co-authored-by: Dipika Sikka
Co-authored-by: Kyle Sayers
Co-authored-by: Kyle Sayers
Co-authored-by: Jincheng Miao
Co-authored-by: 黄石
Co-authored-by: dhuangnm <74931910+dhuangnm@users.noreply.github.com>
Co-authored-by: dhuangnm
Co-authored-by: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com>
Co-authored-by: andy-neuma
Co-authored-by: Rahul Tuli
Co-authored-by: Domenic Barbuzzi
Co-authored-by: Michael Goin
Co-authored-by: George
---
src/llmcompressor/utils/fsdp/context.py | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/src/llmcompressor/utils/fsdp/context.py b/src/llmcompressor/utils/fsdp/context.py
index 177b2c02f..8cc062c19 100644
--- a/src/llmcompressor/utils/fsdp/context.py
+++ b/src/llmcompressor/utils/fsdp/context.py
@@ -1,10 +1,13 @@
try:
from accelerate import Accelerator
+except ImportError:
+ Accelerator = None
+
+try:
from torch.distributed.fsdp import FullyShardedDataParallel
- from torch.distributed.fsdp._common_utils import TrainingState
+ from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE, TrainingState
except ImportError:
FullyShardedDataParallel = None
- Accelerator = None
from contextlib import nullcontext
@@ -14,8 +17,6 @@
"fix_fsdp_module_name",
]
-FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
-
def summon_full_params_context(model, offload_to_cpu: bool = False):
if FullyShardedDataParallel is not None:
@@ -46,12 +47,15 @@ def main_process_first_context():
def fix_fsdp_module_name(name: str) -> str:
"""
Remove FSDP wrapper prefixes from a module name.
- Accounts for scenario where FSDP_WRAPPER_NAME is
+ Accounts for scenario where FSDP_WRAPPED_MODULE is
at the end of the name, as well as in the middle.
:param name: name to strip
:return: stripped name
"""
- return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
- "." + FSDP_WRAPPER_NAME, ""
+ if FullyShardedDataParallel is None:
+ return name
+
+ return name.replace(FSDP_WRAPPED_MODULE + ".", "").replace(
+ "." + FSDP_WRAPPED_MODULE, ""
)
From 768be88edfb9de9837efc97cf6bc43940e10a710 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:10:58 -0500
Subject: [PATCH 09/15] Update maintainers, add notice (#1091)
## Purpose ##
* Cover legal bases w.r.t. HF code, in a similar way SparseML did
## Changes ##
* Add a notice which indicates that some code is under HF copyright
* Update maintainers list to reflect team changes
---------
Signed-off-by: Kyle Sayers
---
.MAINTAINERS | 8 +++---
NOTICE | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 76 insertions(+), 4 deletions(-)
create mode 100644 NOTICE
diff --git a/.MAINTAINERS b/.MAINTAINERS
index 1a6355130..edc7f543c 100644
--- a/.MAINTAINERS
+++ b/.MAINTAINERS
@@ -2,11 +2,11 @@
# uncommented maintainers will be included in code review triage
markurtz
-bfineran
-rahul-tuli
-dbogunowicz
dsikka
-Satrat
+rahul-tuli
+horheynm
+brian-dellabetta
+kylesayrs
# mgoin
# anmarques
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 000000000..f9c4e8178
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,72 @@
+LLM Compressor
+
+This product includes software developed in association with the vLLM Project (https://github.com/vllm-project).
+
+Source code in this repository is variously licensed under the Apache License
+Version 2.0, an Apache-compatible license.
+
+* For a copy of the Apache License Version 2.0, please see [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
+
+* For a copy of all other Apache-compatible licenses and notices,
+ they will be listed below.
+
+========================================================================
+NOTICES
+========================================================================
+
+Package dependencies are defined in the Python setup.py file in this repository's top-level directory and have their own Apache-compatible licenses and terms.
+
+Hugging Face Transformers License https://github.com/huggingface/transformers/blob/master/LICENSE
+
+Some model implementations subclass and include code snippets from Hugging Face Transformers.
+These snippets include and are subject to the Hugging Face Copyright and are
+provided under the Apache License, Version 2.0 https://github.com/huggingface/transformers/blob/master/LICENSE
+
+PyTorch License https://github.com/pytorch/pytorch/blob/master/LICENSE
+
+Sample images are provided under a Creative Commons Attribution License
+https://creativecommons.org/licenses/by/4.0/legalcode
+```
+@article{cocodataset,
+ author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{'{a} }r and C. Lawrence Zitnick},
+ title = {Microsoft {COCO:} Common Objects in Context},
+ journal = {CoRR},
+ volume = {abs/1405.0312},
+ year = {2014},
+ url = {http://arxiv.org/abs/1405.0312},
+ archivePrefix = {arXiv},
+ eprint = {1405.0312},
+ timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
+ biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
+
+Sample audio is provided under a Creative Commons Attribution License https://creativecommons.org/licenses/by/4.0/legalcode
+```
+@article{DBLP:journals/corr/abs-2111-09344,
+ author = {Daniel Galvez and
+ Greg Diamos and
+ Juan Ciro and
+ Juan Felipe Cer{\'{o}}n and
+ Keith Achorn and
+ Anjali Gopi and
+ David Kanter and
+ Maximilian Lam and
+ Mark Mazumder and
+ Vijay Janapa Reddi},
+ title = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition
+ Dataset for Commercial Usage},
+ journal = {CoRR},
+ volume = {abs/2111.09344},
+ year = {2021},
+ url = {https://arxiv.org/abs/2111.09344},
+ eprinttype = {arXiv},
+ eprint = {2111.09344},
+ timestamp = {Mon, 22 Nov 2021 16:44:07 +0100},
+ biburl = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
+
+Other external dependencies, if referenced in this repository's various subdirectories, are subject to their associated licenses and terms.
\ No newline at end of file
From a76563ab99c0e0ebf0347846fa580b27e55df74b Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:12:29 -0500
Subject: [PATCH 10/15] Replace readme paths with urls (#1097)
## Purpose ##
* Files with the `.md` extension are not listed in the
[MANIFEST.in](https://github.com/vllm-project/llm-compressor/blob/main/MANIFEST.in),
meaning that they will not be included in the LLM Compressor PyPI
package. This means that references to these files are left dangling for
users who have installed from the PyPI package. Rather than including
`.md` files in the package and having to also ship all the large image files
associated with them, this PR moves the references to URLs hosted on
GitHub
* While the GitHub URL paths may change between versions, this solution
works in lieu of a dedicated Read the Docs build for each version
* This solution also aligns with the practice of other libraries, which
point to hosted URLs rather than file paths
* Note that this does not apply to files which are themselves `.md`
files, as these files will not be included in the pypi distribution
* `src/llmcompressor/transformers/finetune/README.md`
* `src/llmcompressor/pipelines/sequential/README.md`
## Changes ##
* Replace readme file paths with urls
* Small change to `DisableQuantization` to ensure quantization is re-enabled
even when exceptions such as tracing errors are raised (see the sketch below)
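A self-contained sketch of the intended behavior (the `Linear` module is a
stand-in for a quantization-initialized model):
```python3
import torch

from llmcompressor.utils.helpers import DisableQuantization

model = torch.nn.Linear(8, 8)  # stand-in for a quantization-initialized model

try:
    with DisableQuantization(model):
        raise RuntimeError("simulated tracing failure")
except RuntimeError:
    # Quantization is re-enabled on exit even though the body raised,
    # thanks to the try/finally added in this patch.
    pass
```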
## Testing ##
* N/A
---------
Signed-off-by: Kyle Sayers
---
src/llmcompressor/modifiers/quantization/gptq/base.py | 3 ++-
src/llmcompressor/modifiers/smoothquant/utils.py | 6 ++++--
src/llmcompressor/utils/helpers.py | 8 +++++---
tests/llmcompressor/modifiers/smoothquant/test_utils.py | 5 ++++-
4 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 5e8a6b47e..5e353d0cb 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -247,7 +247,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
warnings.warn(
f"Failed to trace {model_name} with inputs {input_names}. For more "
"information on tracing with the sequential pipeline, see "
- "`src/llmcompressor/transformers/tracing/GUIDE.md`"
+ "https://github.com/vllm-project/llm-compressor/blob/main/"
+ "src/llmcompressor/transformers/tracing/GUIDE.md"
)
if isinstance(exception, unfixable_errors):
raise exception
diff --git a/src/llmcompressor/modifiers/smoothquant/utils.py b/src/llmcompressor/modifiers/smoothquant/utils.py
index 83c82704e..a3af344e6 100644
--- a/src/llmcompressor/modifiers/smoothquant/utils.py
+++ b/src/llmcompressor/modifiers/smoothquant/utils.py
@@ -1,5 +1,4 @@
import functools
-import pathlib
from collections import namedtuple
from typing import Dict, List, Tuple, Union
@@ -94,7 +93,10 @@ def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except Exception as original_exception:
- readme_location = pathlib.Path(__file__).parent / "README.md"
+ readme_location = (
+ "https://github.com/vllm-project/llm-compressor/tree/main/"
+ "src/llmcompressor/modifiers/smoothquant"
+ )
raise RuntimeError(
f"Error resolving mappings for given architecture."
f"Please refer to the README at {readme_location} for more information."
diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py
index e6bf7b319..ad4d884b2 100644
--- a/src/llmcompressor/utils/helpers.py
+++ b/src/llmcompressor/utils/helpers.py
@@ -1091,9 +1091,11 @@ def DisableQuantization(model: torch.nn.Module):
"""
Disable quantization from QuantizationModifier
"""
- model.apply(disable_quantization)
- yield
- model.apply(enable_quantization)
+ try:
+ model.apply(disable_quantization)
+ yield
+ finally:
+ model.apply(enable_quantization)
@contextlib.contextmanager
diff --git a/tests/llmcompressor/modifiers/smoothquant/test_utils.py b/tests/llmcompressor/modifiers/smoothquant/test_utils.py
index 95be6bd30..457b64cdb 100644
--- a/tests/llmcompressor/modifiers/smoothquant/test_utils.py
+++ b/tests/llmcompressor/modifiers/smoothquant/test_utils.py
@@ -12,7 +12,10 @@
@pytest.mark.unit
def test_handle_mapping_resolution_errors():
- README_LOCATION = "llmcompressor/modifiers/smoothquant/README.md"
+ README_LOCATION = (
+ "https://github.com/vllm-project/llm-compressor/tree/main/"
+ "src/llmcompressor/modifiers/smoothquant"
+ )
@handle_mapping_resolution_errors
def func_that_raises_exception():
From ba8563c58fda59168da02368505e8dafeff75b88 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:12:51 -0500
Subject: [PATCH 11/15] GPTQ add arXiv link, move file location (#1100)
## Purpose ##
* Better docstring for GPTQ
* Reduce unnecessary file hierarchy
Signed-off-by: Kyle Sayers
---
src/llmcompressor/modifiers/quantization/gptq/base.py | 6 ++++--
.../quantization/gptq/{utils => }/gptq_quantize.py | 0
.../modifiers/quantization/gptq/utils/__init__.py | 3 ---
3 files changed, 4 insertions(+), 5 deletions(-)
rename src/llmcompressor/modifiers/quantization/gptq/{utils => }/gptq_quantize.py (100%)
delete mode 100644 src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 5e353d0cb..65e1c90e0 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -16,7 +16,7 @@
from llmcompressor.core import State
from llmcompressor.modifiers import Modifier, ModifierFactory
from llmcompressor.modifiers.quantization.calibration import freeze_module_quantization
-from llmcompressor.modifiers.quantization.gptq.utils.gptq_quantize import (
+from llmcompressor.modifiers.quantization.gptq.gptq_quantize import (
accumulate_hessian,
make_empty_hessian,
quantize_weight,
@@ -36,7 +36,9 @@
class GPTQModifier(Modifier, HooksMixin):
"""
- Modifier for applying the one-shot OBCQ algorithm to a model
+ Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier
+ uses activations to calibrate a hessian matrix, which is then used to determine
+ optimal quantization values and orderings for the model weights.
| Sample yaml:
| test_stage:
diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py
similarity index 100%
rename from src/llmcompressor/modifiers/quantization/gptq/utils/gptq_quantize.py
rename to src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py
diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py b/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py
deleted file mode 100644
index ec39da973..000000000
--- a/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# flake8: noqa
-
-from .gptq_quantize import *
From 507b1a403a1de513f1529f8b7ed191dceaeede0b Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:15:29 -0500
Subject: [PATCH 12/15] Extend `remove_hooks` to remove subsets (#1021)
## Purpose ##
* Allow subsets of hooks to be removed
* Not strictly needed, but helps promote code clarity in the case of
Wanda, which adds and removes subsets of hooks at different times.
## Postrequisites ##
* https://github.com/vllm-project/llm-compressor/pull/1023
* Layer compressor deprecation
## Changes ##
* Change the datatype of `_hooks` from `List` to `Set`
* Add `handles` argument to `HooksMixin.remove_hooks`
## Testing ##
* Added `test_remove_hooks_parameterized` test
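A self-contained sketch of the new subset-aware API (the dummy modifier and
layer below are illustrative, mirroring the added test):
```python3
import torch

from llmcompressor.modifiers.utils.hooks import HooksMixin


class DummyMod(HooksMixin):
    def hook(self, *args, **kwargs):
        pass


layer = torch.nn.Linear(4, 4)
mod = DummyMod()
pre_handle = mod.register_hook(layer, mod.hook, "forward_pre")
post_handle = mod.register_hook(layer, mod.hook, "forward")

# Remove only the forward hook; the forward_pre hook stays registered.
mod.remove_hooks({post_handle})
assert mod._hooks == {pre_handle}
```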
---------
Signed-off-by: Kyle Sayers
---
src/llmcompressor/modifiers/utils/hooks.py | 22 +++++++++++++------
.../modifiers/utils/test_hooks.py | 21 ++++++++++++++++++
2 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/src/llmcompressor/modifiers/utils/hooks.py b/src/llmcompressor/modifiers/utils/hooks.py
index bb1755519..386d58cac 100644
--- a/src/llmcompressor/modifiers/utils/hooks.py
+++ b/src/llmcompressor/modifiers/utils/hooks.py
@@ -1,6 +1,6 @@
import contextlib
from functools import wraps
-from typing import Any, Callable, ClassVar, List, Union
+from typing import Any, Callable, ClassVar, Optional, Set, Union
import torch
from loguru import logger
@@ -30,7 +30,7 @@ class HooksMixin(BaseModel):
"""
_HOOKS_DISABLED: ClassVar[bool] = False # attached to global HooksMixin
- _hooks: List[RemovableHandle] = [] # attached to local subclasses
+ _hooks: Set[RemovableHandle] = set() # attached to local subclasses
@classmethod
@contextlib.contextmanager
@@ -70,14 +70,22 @@ def wrapped_hook(*args, **kwargs):
register_function = getattr(target, f"register_{hook_type}_hook")
handle = register_function(wrapped_hook, **kwargs)
- self._hooks.append(handle)
+ self._hooks.add(handle)
logger.debug(f"{self} added {handle}")
return handle
- def remove_hooks(self):
- """Remove all hooks belonging to a modifier"""
- for hook in self._hooks:
+ def remove_hooks(self, handles: Optional[Set[RemovableHandle]] = None):
+ """
+ Removes hooks registered by this modifier
+
+ :param handles: optional set of handles to remove, defaults to all hooks
+ registered by this modifier
+ """
+ if handles is None:
+ handles = self._hooks
+
+ for hook in handles:
hook.remove()
- self._hooks = []
+ self._hooks -= handles
diff --git a/tests/llmcompressor/modifiers/utils/test_hooks.py b/tests/llmcompressor/modifiers/utils/test_hooks.py
index 5c4fc5891..df1eafedb 100644
--- a/tests/llmcompressor/modifiers/utils/test_hooks.py
+++ b/tests/llmcompressor/modifiers/utils/test_hooks.py
@@ -64,6 +64,27 @@ def test_remove_hooks():
assert mod_a.hook_called and not mod_b.hook_called
+def test_remove_hooks_parameterized():
+ model = DummyModel()
+
+ mod_a = ModA()
+ mod_a_pre_hook = mod_a.register_hook(model.linear1, mod_a.hook, "forward_pre")
+ mod_a_post_hook = mod_a.register_hook(model.linear1, mod_a.hook, "forward")
+
+ mod_b = ModB()
+ mod_b_pre_hook = mod_b.register_hook(model.linear2, mod_b.hook, "forward_pre")
+ mod_b_post_hook = mod_b.register_hook(model.linear2, mod_b.hook, "forward")
+
+ mod_a.remove_hooks(set([mod_a_post_hook]))
+ mod_b.remove_hooks(set([mod_b_pre_hook]))
+
+ assert len(mod_a._hooks) == 1 and next(iter(mod_a._hooks)) == mod_a_pre_hook
+ assert len(mod_b._hooks) == 1 and next(iter(mod_b._hooks)) == mod_b_post_hook
+
+ model(model.dummy_inputs)
+ assert mod_a.hook_called and mod_b.hook_called
+
+
def test_disable_hooks():
model = DummyModel()
From 7fc4a6740dd7bc19285ba0aed79542305ceeb855 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:20:04 -0500
Subject: [PATCH 13/15] [Audio] Whisper Example and Readme (#1106)
## Purpose ##
* Show example of quantizing whisper audio model
## Changes ##
* Add whisper audio model example
* Add traceable whisper definition (only need to comment out a value
error check)
* The embedded audio is achieved using [github attached
files](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/attaching-files).
While there's no official word on how long these files are maintained,
if it is found that the file is deleted at some point, then we can
replace it with a link to the file uploaded to the repo.
## Testing ##
Successfully quantized whisper models and generated reasonable sample
outputs
* https://huggingface.co/nm-testing/whisper-tiny-W4A16-G128
* https://huggingface.co/nm-testing/whisper-large-v2-W4A16-G128
---------
Signed-off-by: Kyle Sayers
---
README.md | 3 +-
examples/multimodal_audio/README.md | 88 ++++++++++
examples/multimodal_audio/whisper_example.py | 116 +++++++++++++
examples/multimodal_vision/README.md | 20 ++-
.../modifiers/smoothquant/utils.py | 11 ++
.../transformers/tracing/__init__.py | 6 +-
.../transformers/tracing/whisper.py | 152 ++++++++++++++++++
7 files changed, 393 insertions(+), 3 deletions(-)
create mode 100644 examples/multimodal_audio/README.md
create mode 100644 examples/multimodal_audio/whisper_example.py
create mode 100644 src/llmcompressor/transformers/tracing/whisper.py
diff --git a/README.md b/README.md
index 9ba3caae3..9021c6193 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,8 @@ Applying quantization with `llmcompressor`:
* [Activation quantization to `fp8`](examples/quantization_w8a8_fp8)
* [Weight only quantization to `int4`](examples/quantization_w4a16)
* [Quantizing MoE LLMs](examples/quantizing_moe)
-* [Quantizing Multimodal VLMs](examples/multimodal_vision)
+* [Quantizing Vision-Language Models](examples/multimodal_vision)
+* [Quantizing Audio-Language Models](examples/multimodal_audio)
### User Guides
Deep dives into advanced usage of `llmcompressor`:
diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md
new file mode 100644
index 000000000..507789490
--- /dev/null
+++ b/examples/multimodal_audio/README.md
@@ -0,0 +1,88 @@
+# Quantizing Multimodal Audio Models #
+
+https://github.com/user-attachments/assets/6732c60b-1ebe-4bed-b409-c16c4415dff5
+
+Audio provided by Daniel Galvez et al. under a Creative Commons Attribution license
+
+```
+<|startoftranscript|> <|en|>
+...
+
+<|transcribe|> <|notimestamps|>
+that's where you have a lot of windows in the south no actually that's passive solar
+and passive solar is something that was developed and designed in the 1960s and 70s
+and it was a great thing for what it was at the time but it's not a passive house
+```
+
+
+This directory contains example scripts for quantizing a variety of audio-language models using GPTQ quantization.
+
+## Compressing Your Own Model ##
+To compress your own multimodal model, start from an existing example and change the `model_id` to match your own model stub.
+```python3
+model_id = "path/to/your/model"
+model = AutoModelForCausalLM.from_pretrained(
+ model_id,
+ device_map="auto",
+ torch_dtype="auto",
+)
+```
+
+## Customizing GPTQModifier Parameters ##
+The GPTQModifier is the modifier responsible for performing quantization of the model weights. For more information on quantizing with different weight schemes, see the `quantization_` examples in the [examples folder](/examples/).
+
+```python3
+recipe = [
+ GPTQModifier(
+ targets="Linear",
+ scheme="W4A16",
+ sequential_targets=["WhisperEncoderLayer", "WhisperDecoderLayer"],
+ ignore=["lm_head"],
+ )
+]
+```
+
+### Sequential Targets ###
+Sequential targets are the modules which determine the granularity of error propagation and activation offloading when performing forward passes of the model. These are typically the "transformer blocks" of the model, also referred to as "layers" within llm-compressor.
+
+Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
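+
+For example, a sketch of trading compression time for lower peak memory by targeting individual `Linear` modules instead of whole decoder layers (all other parameters mirror the recipe above):
+
+```python3
+recipe = GPTQModifier(
+    targets="Linear",
+    scheme="W4A16",
+    # finer-grained sequential targets: fewer hessians are held in memory at
+    # once, at the cost of more activation offloading and onloading
+    sequential_targets=["Linear"],
+    ignore=["lm_head"],
+)
+```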
+
+### Ignore ###
+If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoiding the untraceable operations.
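+
+For example, a hypothetical recipe that keeps the audio encoder out of the traced graph entirely (the `re:.*encoder.*` pattern is illustrative; substitute the modules that fail to trace in your model):
+
+```python3
+recipe = GPTQModifier(
+    targets="Linear",
+    scheme="W4A16",
+    # illustrative pattern: skip tracing (and quantizing) the encoder internals
+    ignore=["lm_head", "re:.*encoder.*"],
+)
+```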
+
+## Tracing Errors ##
+Because the architectures of audio-language models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
+
+## Adding Your Own Smoothquant Mappings ##
+For a guide on adding smoothquant mappings for your model, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
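+
+As a reference, here is a sketch mirroring the Whisper mappings registered in `src/llmcompressor/modifiers/smoothquant/utils.py`, which pair attention and MLP input projections with the layer norms that precede them (`LayerMap` is defined in that module):
+
+```python3
+WHISPER_V2_SMOOTHQUANT_MAPPINGS = [
+    LayerMap(
+        balance_layers=["re:.*k_proj", "re:.*v_proj", "re:.*q_proj"],
+        smooth_layers="re:.*self_attn_layer_norm",
+    ),
+    LayerMap(
+        balance_layers=["re:.*fc1"],
+        smooth_layers="re:.*final_layer_norm",
+    ),
+]
+```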
+
+## Adding Your Own Data Collator ##
+Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
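+
+As a minimal sketch, this is the generic collator used by the whisper example in this directory; it converts each feature of a single-sample batch into a tensor:
+
+```python3
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+```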
+
+## Sample Audio Provided Under a Creative Commons Attribution License ##
+https://creativecommons.org/licenses/by/4.0/legalcode
+```
+@article{DBLP:journals/corr/abs-2111-09344,
+ author = {Daniel Galvez and
+ Greg Diamos and
+ Juan Ciro and
+ Juan Felipe Cer{\'{o}}n and
+ Keith Achorn and
+ Anjali Gopi and
+ David Kanter and
+ Maximilian Lam and
+ Mark Mazumder and
+ Vijay Janapa Reddi},
+ title = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition
+ Dataset for Commercial Usage},
+ journal = {CoRR},
+ volume = {abs/2111.09344},
+ year = {2021},
+ url = {https://arxiv.org/abs/2111.09344},
+ eprinttype = {arXiv},
+ eprint = {2111.09344},
+ timestamp = {Mon, 22 Nov 2021 16:44:07 +0100},
+ biburl = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
\ No newline at end of file
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
new file mode 100644
index 000000000..303c9e935
--- /dev/null
+++ b/examples/multimodal_audio/whisper_example.py
@@ -0,0 +1,116 @@
+import torch
+from datasets import load_dataset
+from transformers import WhisperProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
+
+# Select model and load it.
+MODEL_ID = "openai/whisper-large-v2"
+
+model = TraceableWhisperForConditionalGeneration.from_pretrained(
+ MODEL_ID,
+ device_map="auto",
+ torch_dtype="auto",
+)
+model.config.forced_decoder_ids = None
+processor = WhisperProcessor.from_pretrained(MODEL_ID)
+
+# Configure the processor for the dataset task.
+processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
+
+# Select calibration dataset.
+DATASET_ID = "MLCommons/peoples_speech"
+DATASET_SUBSET = "test"
+DATASET_SPLIT = "test"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
+ds = load_dataset(
+ DATASET_ID,
+ DATASET_SUBSET,
+ split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
+ trust_remote_code=True,
+)
+
+
+def preprocess(example):
+ return {
+ "array": example["audio"]["array"],
+ "sampling_rate": example["audio"]["sampling_rate"],
+ "text": " " + example["text"].capitalize(),
+ }
+
+
+ds = ds.map(preprocess, remove_columns=ds.column_names)
+
+
+# Process inputs.
+def process(sample):
+ audio_inputs = processor(
+ audio=sample["array"],
+ sampling_rate=sample["sampling_rate"],
+ return_tensors="pt",
+ )
+
+ text_inputs = processor(
+ text=sample["text"], add_special_tokens=True, return_tensors="pt"
+ )
+ text_inputs["decoder_input_ids"] = text_inputs["input_ids"]
+ del text_inputs["input_ids"]
+
+ return dict(**audio_inputs, **text_inputs)
+
+
+ds = ds.map(process, remove_columns=ds.column_names)
+
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+ assert len(batch) == 1
+ return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
+# Recipe
+recipe = [
+ SmoothQuantModifier(smoothing_strength=0.8),
+ GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
+]
+
+# Apply algorithms.
+oneshot(
+ model=model,
+ dataset=ds,
+ recipe=recipe,
+ max_seq_length=MAX_SEQUENCE_LENGTH,
+ num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+ data_collator=data_collator,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+sample_features = next(iter(ds))["input_features"]
+sample_decoder_ids = [processor.tokenizer.prefix_tokens]
+sample_input = {
+ "input_features": torch.tensor(sample_features).to(model.device),
+ "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device),
+}
+
+output = model.generate(**sample_input, language="en")
+print(processor.batch_decode(output, skip_special_tokens=True))
+print("==========================================\n\n")
+# that's where you have a lot of windows in the south no actually that's passive solar
+# and passive solar is something that was developed and designed in the 1960s and 70s
+# and it was a great thing for what it was at the time but it's not a passive house
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
diff --git a/examples/multimodal_vision/README.md b/examples/multimodal_vision/README.md
index 69f31ffb0..2f5ba83c1 100644
--- a/examples/multimodal_vision/README.md
+++ b/examples/multimodal_vision/README.md
@@ -61,4 +61,22 @@ Because the architectures of vision-language models is often times more complex
For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
## Adding Your Own Data Collator ##
-Most examples utilize a generic `data_collator` which correctly correlates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
\ No newline at end of file
+Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
+
+## Sample Image Provided Under a Creative Commons Attribution License ##
+https://creativecommons.org/licenses/by/4.0/legalcode
+```
+@article{cocodataset,
+ author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{\'{a}}r and C. Lawrence Zitnick},
+ title = {Microsoft {COCO:} Common Objects in Context},
+ journal = {CoRR},
+ volume = {abs/1405.0312},
+ year = {2014},
+ url = {http://arxiv.org/abs/1405.0312},
+ archivePrefix = {arXiv},
+ eprint = {1405.0312},
+ timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
+ biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
\ No newline at end of file
diff --git a/src/llmcompressor/modifiers/smoothquant/utils.py b/src/llmcompressor/modifiers/smoothquant/utils.py
index a3af344e6..a2a597215 100644
--- a/src/llmcompressor/modifiers/smoothquant/utils.py
+++ b/src/llmcompressor/modifiers/smoothquant/utils.py
@@ -52,6 +52,16 @@
smooth_layers="re:.*post_attention_layernorm",
),
]
+WHISPER_V2_SMOOTHQUANT_MAPPINGS: List[LayerMap] = [
+ LayerMap(
+ balance_layers=["re:.*k_proj", "re:.*v_proj", "re:.*q_proj"],
+ smooth_layers="re:.*self_attn_layer_norm",
+ ),
+ LayerMap(
+ balance_layers=["re:.*fc1"],
+ smooth_layers="re:.*final_layer_norm",
+ ),
+]
# Registry of layer mappings for different architectures
@@ -64,6 +74,7 @@
"BloomForCausalLM": BLOOM_SMOOTHQUANT_MAPPINGS,
"ChatGLMForConditionalGeneration": BLOOM_SMOOTHQUANT_MAPPINGS,
"Phi3VForCausalLM": PHI3_VISION_SMOOTHQUANT_MAPPINGS,
+ "WhisperForConditionalGeneration": WHISPER_V2_SMOOTHQUANT_MAPPINGS,
}
diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py
index d8df42c93..39410a1ef 100644
--- a/src/llmcompressor/transformers/tracing/__init__.py
+++ b/src/llmcompressor/transformers/tracing/__init__.py
@@ -10,10 +10,14 @@
from .idefics3 import (
Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration
)
+from .whisper import (
+ WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration
+)
__all__ = [
"TraceableLlavaForConditionalGeneration",
"TraceableMllamaForConditionalGeneration",
"TraceableQwen2VLForConditionalGeneration",
- "TraceableIdefics3ForConditionalGeneration"
+ "TraceableIdefics3ForConditionalGeneration",
+ "TraceableWhisperForConditionalGeneration",
]
diff --git a/src/llmcompressor/transformers/tracing/whisper.py b/src/llmcompressor/transformers/tracing/whisper.py
new file mode 100644
index 000000000..6e245760c
--- /dev/null
+++ b/src/llmcompressor/transformers/tracing/whisper.py
@@ -0,0 +1,152 @@
+# flake8: noqa
+# coding=utf-8
+# Copyright 2022 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# vllm-project: no copyright
+"""PyTorch Whisper model."""
+
+import torch
+from torch import nn
+
+from transformers import WhisperConfig
+from transformers.models.whisper.modeling_whisper import (
+ WhisperEncoder,
+ WhisperDecoder,
+ WhisperModel,
+ WhisperForConditionalGeneration,
+ WhisperForAudioClassification,
+)
+from transformers.modeling_outputs import BaseModelOutput
+
+
+class WhisperEncoder(WhisperEncoder):
+ def forward(
+ self,
+ input_features,
+ attention_mask=None,
+ head_mask=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+
+ expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
+ # TRACING: assume preprocessing is correct
+ # if input_features.shape[-1] != expected_seq_length:
+ if False:
+ raise ValueError(
+ f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
+ )
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+ inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+ inputs_embeds = inputs_embeds.permute(0, 2, 1)
+ embed_pos = self.embed_positions.weight
+
+ hidden_states = inputs_embeds + embed_pos
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ # check if head_mask has a correct number of layers specified if desired
+ if head_mask is not None:
+ assert head_mask.size()[0] == (
+ len(self.layers)
+ ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ to_drop = False
+ if self.training:
+ dropout_probability = torch.rand([])
+ if dropout_probability < self.layerdrop: # skip the layer
+ to_drop = True
+
+ if to_drop:
+ layer_outputs = (None, None)
+ else:
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ encoder_layer.__call__,
+ hidden_states,
+ None,
+ (head_mask[idx] if head_mask is not None else None),
+ output_attentions,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ None,
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ hidden_states = self.layer_norm(hidden_states)
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+class WhisperModel(WhisperModel):
+ def __init__(self, config: WhisperConfig):
+ super().__init__(config)
+
+ self.encoder = WhisperEncoder(config)
+ self.decoder = WhisperDecoder(config)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+class WhisperForConditionalGeneration(WhisperForConditionalGeneration):
+ def __init__(self, config: WhisperConfig):
+ super().__init__(config)
+ self.model = WhisperModel(config)
+ self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False)
+ self.max_target_positions = config.max_target_positions
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+
+class WhisperForAudioClassification(WhisperForAudioClassification):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.encoder = WhisperEncoder(config)
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
+ if config.use_weighted_layer_sum:
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+ self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+ self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
\ No newline at end of file
From 999d6600e3112c01f281085d5ebb7b934f1f4c0e Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Wed, 29 Jan 2025 18:20:23 -0500
Subject: [PATCH 14/15] [Audio] Add whisper fp8 dynamic example (#1111)
## Purpose ##
* Add example of quantizing multimodal model with FP8 dynamic
## Changes ##
* Add whisper FP8 example, collaborated with @mgoin
## Testing ##
* Ran the example to completion
Signed-off-by: Kyle Sayers
---
.../quantization_w8a8_fp8/whisper_example.py | 46 +++++++++++++++++++
1 file changed, 46 insertions(+)
create mode 100644 examples/quantization_w8a8_fp8/whisper_example.py
diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py
new file mode 100644
index 000000000..df18b0d11
--- /dev/null
+++ b/examples/quantization_w8a8_fp8/whisper_example.py
@@ -0,0 +1,46 @@
+from datasets import load_dataset
+from transformers import AutoProcessor, WhisperForConditionalGeneration
+
+from llmcompressor.modifiers.quantization import QuantizationModifier
+from llmcompressor.transformers import oneshot
+
+MODEL_ID = "openai/whisper-large-v2"
+
+# Load model.
+model = WhisperForConditionalGeneration.from_pretrained(
+ MODEL_ID, device_map="auto", torch_dtype="auto"
+)
+model.config.forced_decoder_ids = None
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
+
+# Configure the quantization algorithm and scheme.
+# In this case, we:
+# * quantize the weights to fp8 with per-channel scales via PTQ
+# * quantize the activations to fp8 dynamically with per-token scales
+recipe = QuantizationModifier(
+ targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+)
+
+# Apply quantization.
+oneshot(model=model, recipe=recipe)
+
+# Confirm generations of the quantized model look sane.
+print("========== SAMPLE GENERATION ==============")
+ds = load_dataset(
+ "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]"
+)
+sample = ds[0]["audio"]
+input_features = processor(
+ sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
+).input_features
+input_features = input_features.to(model.device)
+predicted_ids = model.generate(input_features, language="en", forced_decoder_ids=None)
+print(processor.batch_decode(predicted_ids, skip_special_tokens=False)[0])
+# Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel
+print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
From 317c10ca901901c43f767c50aac8362e2c930448 Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Thu, 30 Jan 2025 16:36:59 -0500
Subject: [PATCH 15/15] [VLM] Update pixtral data collator to reflect latest
transformers changes (#1116)
## Purpose ##
* In transformers==4.48.0, the Pixtral processor was updated to not add
an additional layer of wrapping for `pixel_values`
(https://github.com/huggingface/transformers/pull/34801). This is more
in line with how other processors handle multimodal inputs
* Because previously the data_collator was being used to unwrap this
unnecessary wrapping, attempting to quantize pixtral with
transformers>=4.48.0 fails
## Changes ##
* Update pixtral data collator to match latest transformers version
* Add comment for those who want to use transformers<4.48.0
## Testing ##
* Ran pixtral example to completion, @shubhra ran pixtral large
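For users pinned to transformers<4.48.0, a hedged sketch of a version-tolerant collator (not part of this change; the shipped example only documents the `[0]` squeeze in a comment):

```python
import torch
import transformers
from packaging import version


def data_collator(batch):
    assert len(batch) == 1
    pixel_values = torch.tensor(batch[0]["pixel_values"])
    # transformers<4.48.0 wraps pixel_values in an extra list, so squeeze it
    if version.parse(transformers.__version__) < version.parse("4.48.0"):
        pixel_values = pixel_values[0]
    return {
        "input_ids": torch.LongTensor(batch[0]["input_ids"]),
        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
        "pixel_values": pixel_values,
    }
```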
---------
Signed-off-by: Kyle Sayers
---
examples/multimodal_vision/pixtral_example.py | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py
index ebb18df12..891819bc6 100644
--- a/examples/multimodal_vision/pixtral_example.py
+++ b/examples/multimodal_vision/pixtral_example.py
@@ -16,18 +16,20 @@
# Oneshot arguments
DATASET_ID = "flickr30k"
-DATASET_SPLIT = {"calibration": "test[:512]"}
NUM_CALIBRATION_SAMPLES = 512
+DATASET_SPLIT = {"calibration": f"test[:{NUM_CALIBRATION_SAMPLES}]"}
MAX_SEQUENCE_LENGTH = 2048
# Define a oneshot data collator for multimodal inputs.
+# NOTE: for transformers<4.48.0, please squeeze the first dimension of `pixel_values`
+# by appending `[0]` to the end of line 32
def data_collator(batch):
assert len(batch) == 1
return {
"input_ids": torch.LongTensor(batch[0]["input_ids"]),
"attention_mask": torch.tensor(batch[0]["attention_mask"]),
- "pixel_values": torch.tensor(batch[0]["pixel_values"])[0],
+ "pixel_values": torch.tensor(batch[0]["pixel_values"]),
}