From 470b42558e181b218139c93a6661619552592134 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 00:23:42 +0000
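Subject: [PATCH 01/21] WIP: add Whisper GPTQ example and audio data collator

Add a Whisper oneshot GPTQ example and a batch-size-1
whisper_data_collator. Only apply the pad mask when a batch contains both
input_ids and attention_mask, and skip tokenization for datasets that
already provide input_features.

Work in progress: the example's tokenize() is an unfinished copy of
generation setup code (it references undefined names such as kwargs), and
the change still contains debugging breakpoint()/print() calls.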

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/whisper_example.py  | 316 ++++++++++++++++++
 .../modifiers/utils/pytorch_helpers.py        |   4 +-
 .../transformers/finetune/data/base.py        |   2 +-
 .../transformers/utils/data_collator.py       |   7 +
 4 files changed, 327 insertions(+), 2 deletions(-)
 create mode 100644 examples/multimodal_audio/whisper_example.py

diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
new file mode 100644
index 000000000..ca705fc89
--- /dev/null
+++ b/examples/multimodal_audio/whisper_example.py
@@ -0,0 +1,316 @@
+import torch
+from datasets import load_dataset
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.utils.data_collator import whisper_data_collator
+
+# Select model and load it.
+MODEL_ID = "openai/whisper-tiny"
+
+model = WhisperForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+model.config.forced_decoder_ids = None
+processor = WhisperProcessor.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "hf-internal-testing/librispeech_asr_dummy"
+DATASET_SPLIT = f"validation[:512]"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, "clean", split=DATASET_SPLIT)
+
+
+def preprocess(example):
+    return {
+        "array": example["audio"]["array"],
+        "sampling_rate": example["audio"]["sampling_rate"],
+    }
+
+
+ds = ds.map(preprocess, remove_columns=ds.column_names)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    generation_config = None
+    return_token_timestamps = None
+    logprob_threshold = None
+    return_timestamps = None
+    language = None
+    task = None
+    is_multilingual = None
+
+    input_features = None,
+    generation_config = None,
+    logits_processor = None,
+    stopping_criteria = None,
+    prefix_allowed_tokens_fn = None,
+    synced_gpus = False,
+    return_timestamps = None,
+    task = None,
+    language = None,
+    is_multilingual = None,
+    prompt_ids = None,
+    prompt_condition_type = None,  # first-segment, all-segments
+    condition_on_prev_tokens = None,
+    temperature = None,
+    compression_ratio_threshold = None,
+    logprob_threshold = None,
+    no_speech_threshold = None,
+    num_segment_frames = None,
+    attention_mask = None,
+    time_precision = 0.02,
+    time_precision_features = 0.01,
+    return_token_timestamps = None,
+    return_segments = False,
+    return_dict_in_generate = None,
+
+
+    input_features = processor(
+        sample["array"],
+        sampling_rate=sample["sampling_rate"],
+    ).input_features
+
+    # 1. prepare generation config
+    generation_config, kwargs = model._prepare_generation_config(generation_config, **kwargs)
+
+    # 2. set global generate variables
+    input_stride = model.model.encoder.conv1.stride[0] * model.model.encoder.conv2.stride[0]
+    num_segment_frames = input_stride * model.config.max_source_positions
+    batch_size, total_input_frames = model._retrieve_total_input_frames(
+        input_features=input_features, input_stride=input_stride, kwargs=kwargs
+    )
+    is_shortform = total_input_frames <= num_segment_frames
+
+    # 3. Make sure generation config is correctly set
+    # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
+    return_dict_in_generate = model._set_return_outputs(
+        return_dict_in_generate=return_dict_in_generate,
+        return_token_timestamps=return_token_timestamps,
+        logprob_threshold=logprob_threshold,
+        generation_config=generation_config,
+    )
+    timestamp_begin = model._set_return_timestamps(
+        return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config
+    )
+    model._set_language_and_task(
+        language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config
+    )
+    model._set_num_frames(
+        return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs
+    )
+    model._set_thresholds_and_condition(
+        generation_config=generation_config,
+        logprob_threshold=logprob_threshold,
+        compression_ratio_threshold=compression_ratio_threshold,
+        no_speech_threshold=no_speech_threshold,
+        condition_on_prev_tokens=condition_on_prev_tokens,
+    )
+    model._set_prompt_condition_type(
+        generation_config=generation_config,
+        prompt_condition_type=prompt_condition_type,
+    )
+
+    # pass self.config for backward compatibility
+    init_tokens = model._retrieve_init_tokens(
+        input_features,
+        batch_size=batch_size,
+        generation_config=generation_config,
+        config=model.config,
+        num_segment_frames=num_segment_frames,
+        kwargs=kwargs,
+    )
+    # passing `decoder_input_ids` is deprecated - the only exception is for assisted generation
+    # where the input ids are handled explicitly by the generate method
+    model._check_decoder_input_ids(kwargs=kwargs)
+
+    # 3. Retrieve logits processors
+    device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device
+    begin_index = init_tokens.shape[1]
+    logits_processor = model._retrieve_logit_processors(
+        generation_config=generation_config,
+        logits_processor=logits_processor,
+        begin_index=begin_index,  # begin index is index of first generated decoder token
+        num_beams=kwargs.get("num_beams", 1),
+        device=device,
+    )
+
+    # 4 Set and retrieve global generation variables
+    model._set_condition_on_prev_tokens(
+        condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config
+    )
+
+    temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature
+    temperature = temperatures[0]
+
+    max_frames, seek = model._retrieve_max_frames_and_seek(
+        batch_size=batch_size,
+        attention_mask=attention_mask,
+        total_input_frames=total_input_frames,
+        is_shortform=is_shortform,
+    )
+
+    # 5 Prepare running variables, list for generation
+    num_return_sequences = generation_config.num_return_sequences
+    (
+        batch_idx_map,
+        cur_bsz,
+        input_features,
+        seek,
+        max_frames,
+        init_tokens,
+        do_condition_on_prev_tokens,
+    ) = model._expand_variables_for_generation(
+        input_features=input_features,
+        seek=seek,
+        max_frames=max_frames,
+        init_tokens=init_tokens,
+        batch_size=batch_size,
+        condition_on_prev_tokens=condition_on_prev_tokens,
+        generation_config=generation_config,
+    )
+
+    current_segments = model._prepare_segments(
+        prompt_ids=prompt_ids,
+        batch_size=cur_bsz,
+        generation_config=generation_config,
+    )
+
+    # 6 Transcribe audio until we reach the end of all input audios
+    while (seek < max_frames).any():
+        # 6.1 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop
+        # in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order
+        # to know which original audio is being decoded
+        # Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk
+        input_features, cur_bsz, batch_idx_map = model._maybe_reduce_batch(
+            input_features=input_features,
+            seek=seek,
+            max_frames=max_frames,
+            cur_bsz=cur_bsz,
+            batch_idx_map=batch_idx_map,
+        )
+        time_offset = (
+            seek.to(torch.float32 if device.type == "mps" else torch.float64) * time_precision / input_stride
+        )
+        seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)
+
+        # 6.2 cut out next 30s segment from input features
+        segment_input = model._get_input_segment(
+            input_features=input_features,
+            seek=seek,
+            seek_num_frames=seek_num_frames,
+            num_segment_frames=num_segment_frames,
+            cur_bsz=cur_bsz,
+            batch_idx_map=batch_idx_map,
+        )
+
+        # 6.3 prepare decoder input ids
+        suppress_tokens = _get_attr_from_logit_processors(
+            logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens"
+        )
+
+        decoder_input_ids, kwargs = model._prepare_decoder_input_ids(
+            cur_bsz=cur_bsz,
+            init_tokens=init_tokens,
+            current_segments=current_segments,
+            batch_idx_map=batch_idx_map,
+            do_condition_on_prev_tokens=do_condition_on_prev_tokens,
+            prompt_ids=prompt_ids,
+            generation_config=generation_config,
+            config=model.config,
+            device=init_tokens.device,
+            suppress_tokens=suppress_tokens,
+            timestamp_begin=timestamp_begin,
+            kwargs=kwargs,
+        )
+
+        # 6.4 set max new tokens or max length
+        model._set_max_new_tokens_and_length(
+            config=model.config,
+            decoder_input_ids=decoder_input_ids,
+            generation_config=generation_config,
+        )
+
+        # 6.5 Set current `begin_index` for all logit processors
+        if logits_processor is not None:
+            for proc in logits_processor:
+                if hasattr(proc, "set_begin_index"):
+                    proc.set_begin_index(decoder_input_ids.shape[-1])
+
+        # 6.6 Run generate with fallback
+        (
+            seek_sequences,
+            seek_outputs,
+            should_skip,
+            do_condition_on_prev_tokens,
+            model_output_type,
+        ) = model.generate_with_fallback(
+            segment_input=segment_input,
+            decoder_input_ids=decoder_input_ids,
+            cur_bsz=cur_bsz,
+            batch_idx_map=batch_idx_map,
+            seek=seek,
+            num_segment_frames=num_segment_frames,
+            max_frames=max_frames,
+            temperatures=temperatures,
+            generation_config=generation_config,
+            logits_processor=logits_processor,
+            stopping_criteria=stopping_criteria,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            synced_gpus=synced_gpus,
+            return_token_timestamps=return_token_timestamps,
+            do_condition_on_prev_tokens=do_condition_on_prev_tokens,
+            is_shortform=is_shortform,
+            batch_size=batch_size,
+            attention_mask=attention_mask,
+            kwargs=kwargs,
+        )
+
+    return segment_input["input_features"]
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# Configure the quantization algorithm to run.
+#   * quantize the weights to 4 bit with GPTQ with a group size 128
+breakpoint()
+sample_input = next(iter(ds))
+output = model(**sample_input)
+
+
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    data_collator=whisper_data_collator,
+)
+breakpoint()
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+sample_input = next(iter(ds))
+output = model.generate(sample_input)
+print(processor.batch_decode(output, skip_special_tokens=True))
+#[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
\ No newline at end of file
diff --git a/src/llmcompressor/modifiers/utils/pytorch_helpers.py b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
index c9869f267..444a0bac2 100644
--- a/src/llmcompressor/modifiers/utils/pytorch_helpers.py
+++ b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
@@ -41,7 +41,9 @@ def apply_pad_mask_to_batch(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.T
     :param batch: batch to apply padding to if it exists
     :return: batch with padding zeroed out in the input_ids
     """
-    batch["input_ids"] = batch["input_ids"] * batch["attention_mask"]
+    print(batch.keys())
+    if "input_ids" in batch and "attention_mask" in batch:
+        batch["input_ids"] = batch["input_ids"] * batch["attention_mask"]
     return batch
 
 
diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
index 81a3fc95f..8a4c150cd 100644
--- a/src/llmcompressor/transformers/finetune/data/base.py
+++ b/src/llmcompressor/transformers/finetune/data/base.py
@@ -105,7 +105,7 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
         dataset = self.rename_columns(dataset)
         logger.debug(f"Dataset after column renaming: {get_columns(dataset)}")
 
-        if "input_ids" not in get_columns(dataset):
+        if "input_ids" not in get_columns(dataset) and "input_features" not in get_columns(dataset):
             # tokenize/ process
             dataset = self.filter_tokenizer_args(dataset)
             logger.debug(f"Tokenizer args after filtering: {get_columns(dataset)}")
diff --git a/src/llmcompressor/transformers/utils/data_collator.py b/src/llmcompressor/transformers/utils/data_collator.py
index b2dc7c651..b4e13e60e 100644
--- a/src/llmcompressor/transformers/utils/data_collator.py
+++ b/src/llmcompressor/transformers/utils/data_collator.py
@@ -46,3 +46,10 @@ def qwen2_vl_data_collator(batch):
         "pixel_values": torch.tensor(batch[0]["pixel_values"]),
         "image_grid_thw": torch.tensor(batch[0]["image_grid_thw"]),
     }
+
+
+def whisper_data_collator(batch):
+    assert len(batch) == 1
+    return {
+        "input_features": torch.LongTensor(batch[0]["input_features"]),
+    }

From 5276c9f3a0ca7776bacee18cc40f34e7e224adf2 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 01:12:58 +0000
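Subject: [PATCH 02/21] WIP: trace Whisper model; sample generation still WIP

Load the example model through TraceableWhisperForConditionalGeneration
(new llmcompressor.transformers.tracing.whisper module) and reduce
tokenize() to feature extraction plus a single decoder start token.
Calibration is temporarily limited to one sample.

Sample generation after quantization is still being debugged.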

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/whisper_example.py  | 277 +++---------------
 .../transformers/tracing/__init__.py          |   4 +
 .../transformers/tracing/whisper.py           | 151 ++++++++++
 .../transformers/utils/data_collator.py       |   3 +-
 4 files changed, 196 insertions(+), 239 deletions(-)
 create mode 100644 src/llmcompressor/transformers/tracing/whisper.py

diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index ca705fc89..228f7718f 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -1,15 +1,16 @@
 import torch
 from datasets import load_dataset
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
+from transformers import WhisperProcessor
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.utils.data_collator import whisper_data_collator
+from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
 
 # Select model and load it.
 MODEL_ID = "openai/whisper-tiny"
 
-model = WhisperForConditionalGeneration.from_pretrained(
+model = TraceableWhisperForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
     torch_dtype="auto",
@@ -19,11 +20,11 @@
 
 # Select calibration dataset.
 DATASET_ID = "hf-internal-testing/librispeech_asr_dummy"
-DATASET_SPLIT = f"validation[:512]"
+DATASET_SPLIT = f"validation[:1]"
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
+NUM_CALIBRATION_SAMPLES = 1 # 512
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -39,253 +40,52 @@ def preprocess(example):
 
 ds = ds.map(preprocess, remove_columns=ds.column_names)
 
+r"""
+Returns:
 
-# Tokenize inputs.
-def tokenize(sample):
-    generation_config = None
-    return_token_timestamps = None
-    logprob_threshold = None
-    return_timestamps = None
-    language = None
-    task = None
-    is_multilingual = None
+Example:
+    ```python
+    >>> import torch
+    >>> from transformers import AutoFeatureExtractor, WhisperModel
+    >>> from datasets import load_dataset
 
-    input_features = None,
-    generation_config = None,
-    logits_processor = None,
-    stopping_criteria = None,
-    prefix_allowed_tokens_fn = None,
-    synced_gpus = False,
-    return_timestamps = None,
-    task = None,
-    language = None,
-    is_multilingual = None,
-    prompt_ids = None,
-    prompt_condition_type = None,  # first-segment, all-segments
-    condition_on_prev_tokens = None,
-    temperature = None,
-    compression_ratio_threshold = None,
-    logprob_threshold = None,
-    no_speech_threshold = None,
-    num_segment_frames = None,
-    attention_mask = None,
-    time_precision = 0.02,
-    time_precision_features = 0.01,
-    return_token_timestamps = None,
-    return_segments = False,
-    return_dict_in_generate = None,
+    >>> model = WhisperModel.from_pretrained("openai/whisper-base")
+    >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
+    >>> input_features = inputs.input_features
+    >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
+    >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
+    >>> list(last_hidden_state.shape)
+    [1, 2, 512]
+    ```
+"""
 
 
+# Tokenize inputs.
+def tokenize(sample):
+    batch_size = 1
     input_features = processor(
         sample["array"],
         sampling_rate=sample["sampling_rate"],
+        return_tensors="pt",
     ).input_features
 
-    # 1. prepare generation config
-    generation_config, kwargs = model._prepare_generation_config(generation_config, **kwargs)
-
-    # 2. set global generate variables
-    input_stride = model.model.encoder.conv1.stride[0] * model.model.encoder.conv2.stride[0]
-    num_segment_frames = input_stride * model.config.max_source_positions
-    batch_size, total_input_frames = model._retrieve_total_input_frames(
-        input_features=input_features, input_stride=input_stride, kwargs=kwargs
-    )
-    is_shortform = total_input_frames <= num_segment_frames
-
-    # 3. Make sure generation config is correctly set
-    # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
-    return_dict_in_generate = model._set_return_outputs(
-        return_dict_in_generate=return_dict_in_generate,
-        return_token_timestamps=return_token_timestamps,
-        logprob_threshold=logprob_threshold,
-        generation_config=generation_config,
-    )
-    timestamp_begin = model._set_return_timestamps(
-        return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config
-    )
-    model._set_language_and_task(
-        language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config
-    )
-    model._set_num_frames(
-        return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs
-    )
-    model._set_thresholds_and_condition(
-        generation_config=generation_config,
-        logprob_threshold=logprob_threshold,
-        compression_ratio_threshold=compression_ratio_threshold,
-        no_speech_threshold=no_speech_threshold,
-        condition_on_prev_tokens=condition_on_prev_tokens,
-    )
-    model._set_prompt_condition_type(
-        generation_config=generation_config,
-        prompt_condition_type=prompt_condition_type,
-    )
-
-    # pass self.config for backward compatibility
-    init_tokens = model._retrieve_init_tokens(
-        input_features,
-        batch_size=batch_size,
-        generation_config=generation_config,
-        config=model.config,
-        num_segment_frames=num_segment_frames,
-        kwargs=kwargs,
-    )
-    # passing `decoder_input_ids` is deprecated - the only exception is for assisted generation
-    # where the input ids are handled explicitly by the generate method
-    model._check_decoder_input_ids(kwargs=kwargs)
-
-    # 3. Retrieve logits processors
-    device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device
-    begin_index = init_tokens.shape[1]
-    logits_processor = model._retrieve_logit_processors(
-        generation_config=generation_config,
-        logits_processor=logits_processor,
-        begin_index=begin_index,  # begin index is index of first generated decoder token
-        num_beams=kwargs.get("num_beams", 1),
-        device=device,
-    )
-
-    # 4 Set and retrieve global generation variables
-    model._set_condition_on_prev_tokens(
-        condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config
-    )
-
-    temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature
-    temperature = temperatures[0]
-
-    max_frames, seek = model._retrieve_max_frames_and_seek(
-        batch_size=batch_size,
-        attention_mask=attention_mask,
-        total_input_frames=total_input_frames,
-        is_shortform=is_shortform,
-    )
-
-    # 5 Prepare running variables, list for generation
-    num_return_sequences = generation_config.num_return_sequences
-    (
-        batch_idx_map,
-        cur_bsz,
-        input_features,
-        seek,
-        max_frames,
-        init_tokens,
-        do_condition_on_prev_tokens,
-    ) = model._expand_variables_for_generation(
-        input_features=input_features,
-        seek=seek,
-        max_frames=max_frames,
-        init_tokens=init_tokens,
-        batch_size=batch_size,
-        condition_on_prev_tokens=condition_on_prev_tokens,
-        generation_config=generation_config,
-    )
-
-    current_segments = model._prepare_segments(
-        prompt_ids=prompt_ids,
-        batch_size=cur_bsz,
-        generation_config=generation_config,
-    )
+    decoder_input_ids = torch.ones((batch_size, 1), dtype=torch.long) * model.config.decoder_start_token_id
 
-    # 6 Transcribe audio until we reach the end of all input audios
-    while (seek < max_frames).any():
-        # 6.1 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop
-        # in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order
-        # to know which original audio is being decoded
-        # Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk
-        input_features, cur_bsz, batch_idx_map = model._maybe_reduce_batch(
-            input_features=input_features,
-            seek=seek,
-            max_frames=max_frames,
-            cur_bsz=cur_bsz,
-            batch_idx_map=batch_idx_map,
-        )
-        time_offset = (
-            seek.to(torch.float32 if device.type == "mps" else torch.float64) * time_precision / input_stride
-        )
-        seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)
-
-        # 6.2 cut out next 30s segment from input features
-        segment_input = model._get_input_segment(
-            input_features=input_features,
-            seek=seek,
-            seek_num_frames=seek_num_frames,
-            num_segment_frames=num_segment_frames,
-            cur_bsz=cur_bsz,
-            batch_idx_map=batch_idx_map,
-        )
-
-        # 6.3 prepare decoder input ids
-        suppress_tokens = _get_attr_from_logit_processors(
-            logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens"
-        )
-
-        decoder_input_ids, kwargs = model._prepare_decoder_input_ids(
-            cur_bsz=cur_bsz,
-            init_tokens=init_tokens,
-            current_segments=current_segments,
-            batch_idx_map=batch_idx_map,
-            do_condition_on_prev_tokens=do_condition_on_prev_tokens,
-            prompt_ids=prompt_ids,
-            generation_config=generation_config,
-            config=model.config,
-            device=init_tokens.device,
-            suppress_tokens=suppress_tokens,
-            timestamp_begin=timestamp_begin,
-            kwargs=kwargs,
-        )
-
-        # 6.4 set max new tokens or max length
-        model._set_max_new_tokens_and_length(
-            config=model.config,
-            decoder_input_ids=decoder_input_ids,
-            generation_config=generation_config,
-        )
-
-        # 6.5 Set current `begin_index` for all logit processors
-        if logits_processor is not None:
-            for proc in logits_processor:
-                if hasattr(proc, "set_begin_index"):
-                    proc.set_begin_index(decoder_input_ids.shape[-1])
-
-        # 6.6 Run generate with fallback
-        (
-            seek_sequences,
-            seek_outputs,
-            should_skip,
-            do_condition_on_prev_tokens,
-            model_output_type,
-        ) = model.generate_with_fallback(
-            segment_input=segment_input,
-            decoder_input_ids=decoder_input_ids,
-            cur_bsz=cur_bsz,
-            batch_idx_map=batch_idx_map,
-            seek=seek,
-            num_segment_frames=num_segment_frames,
-            max_frames=max_frames,
-            temperatures=temperatures,
-            generation_config=generation_config,
-            logits_processor=logits_processor,
-            stopping_criteria=stopping_criteria,
-            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-            synced_gpus=synced_gpus,
-            return_token_timestamps=return_token_timestamps,
-            do_condition_on_prev_tokens=do_condition_on_prev_tokens,
-            is_shortform=is_shortform,
-            batch_size=batch_size,
-            attention_mask=attention_mask,
-            kwargs=kwargs,
-        )
-
-    return segment_input["input_features"]
+    return {
+        "input_features": input_features,
+        "decoder_input_ids": decoder_input_ids
+    }
 
 
 ds = ds.map(tokenize, remove_columns=ds.column_names)
 
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
-breakpoint()
-sample_input = next(iter(ds))
-output = model(**sample_input)
+#breakpoint()
+#sample_input = next(iter(ds))
+#output = model(**sample_input)
 
 
 recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
@@ -299,13 +99,14 @@ def tokenize(sample):
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     data_collator=whisper_data_collator,
 )
-breakpoint()
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
-sample_input = next(iter(ds))
-output = model.generate(sample_input)
+sample_input = whisper_data_collator([next(iter(ds))]).to(model.device)
+sample_input = {k: v.to("cuda:0") for k, v in sample_input.items()}
+output = model.generate(**sample_input)
+breakpoint()
 print(processor.batch_decode(output, skip_special_tokens=True))
 #[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']
 print("==========================================\n\n")
diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py
index 4baa5864d..b9202d95b 100644
--- a/src/llmcompressor/transformers/tracing/__init__.py
+++ b/src/llmcompressor/transformers/tracing/__init__.py
@@ -5,9 +5,13 @@
 from .mllama import (
     MllamaForConditionalGeneration as TraceableMllamaForConditionalGeneration,
 )
+from .whisper import (
+    WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration
+)
 
 __all__ = [
     "TraceableLlavaForConditionalGeneration",
     "TraceableMllamaForConditionalGeneration",
     "TraceableMistralForCausalLM",
+    "TraceableWhisperForConditionalGeneration",
 ]
diff --git a/src/llmcompressor/transformers/tracing/whisper.py b/src/llmcompressor/transformers/tracing/whisper.py
new file mode 100644
index 000000000..b8241f764
--- /dev/null
+++ b/src/llmcompressor/transformers/tracing/whisper.py
@@ -0,0 +1,151 @@
+# coding=utf-8
+# Copyright 2022 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# vllm-project: no copyright
+"""PyTorch Whisper model."""
+
+import torch
+from torch import nn
+
+from transformers import WhisperConfig
+from transformers.models.whisper.modeling_whisper import (
+    WhisperEncoder,
+    WhisperDecoder,
+    WhisperModel,
+    WhisperForConditionalGeneration,
+    WhisperForAudioClassification,
+)
+from transformers.modeling_outputs import BaseModelOutput
+
+
+class WhisperEncoder(WhisperEncoder):
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+
+        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
+        # TRACING: assume preprocessing is correct
+        # if input_features.shape[-1] != expected_seq_length:
+        if False:
+            raise ValueError(
+                f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
+            )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+        embed_pos = self.embed_positions.weight
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+                    layer_outputs = self._gradient_checkpointing_func(
+                        encoder_layer.__call__,
+                        hidden_states,
+                        None,
+                        (head_mask[idx] if head_mask is not None else None),
+                        output_attentions,
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        None,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class WhisperModel(WhisperModel):
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+
+        self.encoder = WhisperEncoder(config)
+        self.decoder = WhisperDecoder(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+class WhisperForConditionalGeneration(WhisperForConditionalGeneration):
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+        self.model = WhisperModel(config)
+        self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        self.max_target_positions = config.max_target_positions
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+
+class WhisperForAudioClassification(WhisperForAudioClassification):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.encoder = WhisperEncoder(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
\ No newline at end of file
diff --git a/src/llmcompressor/transformers/utils/data_collator.py b/src/llmcompressor/transformers/utils/data_collator.py
index b4e13e60e..111e12b07 100644
--- a/src/llmcompressor/transformers/utils/data_collator.py
+++ b/src/llmcompressor/transformers/utils/data_collator.py
@@ -51,5 +51,6 @@ def qwen2_vl_data_collator(batch):
 def whisper_data_collator(batch):
     assert len(batch) == 1
     return {
-        "input_features": torch.LongTensor(batch[0]["input_features"]),
+        "input_features": torch.tensor(batch[0]["input_features"]),
+        "decoder_input_ids": torch.tensor(batch[0]["decoder_input_ids"]),
     }

From 898f86ec4f9cd20fd2271e916c53b8644d2555f3 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 01:32:54 +0000
Subject: [PATCH 03/21] WIP: whisper example working, still need to change dataset split

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/whisper_example.py  | 63 +++++++------------
 .../transformers/finetune/data/base.py        |  4 +-
 .../transformers/tracing/whisper.py           |  1 +
 3 files changed, 27 insertions(+), 41 deletions(-)

diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index 228f7718f..dd3bf2fb1 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -1,11 +1,10 @@
-import torch
 from datasets import load_dataset
 from transformers import WhisperProcessor
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
-from llmcompressor.transformers.utils.data_collator import whisper_data_collator
 from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
+from llmcompressor.transformers.utils.data_collator import whisper_data_collator
 
 # Select model and load it.
 MODEL_ID = "openai/whisper-tiny"
@@ -20,11 +19,11 @@
 
 # Select calibration dataset.
 DATASET_ID = "hf-internal-testing/librispeech_asr_dummy"
-DATASET_SPLIT = f"validation[:1]"
+DATASET_SPLIT = "validation[:512]"
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 1 # 512
+NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -40,27 +39,6 @@ def preprocess(example):
 
 ds = ds.map(preprocess, remove_columns=ds.column_names)
 
-r"""
-Returns:
-
-Example:
-    ```python
-    >>> import torch
-    >>> from transformers import AutoFeatureExtractor, WhisperModel
-    >>> from datasets import load_dataset
-
-    >>> model = WhisperModel.from_pretrained("openai/whisper-base")
-    >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
-    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-    >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
-    >>> input_features = inputs.input_features
-    >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
-    >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
-    >>> list(last_hidden_state.shape)
-    [1, 2, 512]
-    ```
-"""
-
 
 # Tokenize inputs.
 def tokenize(sample):
@@ -71,23 +49,29 @@ def tokenize(sample):
         return_tensors="pt",
     ).input_features
 
-    decoder_input_ids = torch.ones((batch_size, 1), dtype=torch.long) * model.config.decoder_start_token_id
+    generation_config, _kwargs = model._prepare_generation_config(None)
 
-    return {
-        "input_features": input_features,
-        "decoder_input_ids": decoder_input_ids
-    }
+    input_stride = (
+        model.model.encoder.conv1.stride[0] * model.model.encoder.conv2.stride[0]
+    )
+    num_segment_frames = input_stride * model.config.max_source_positions
+
+    decoder_input_ids = model._retrieve_init_tokens(
+        input_features,
+        batch_size=batch_size,
+        generation_config=generation_config,
+        config=model.config,
+        num_segment_frames=num_segment_frames,
+        kwargs={},
+    )
+
+    return {"input_features": input_features, "decoder_input_ids": decoder_input_ids}
 
 
 ds = ds.map(tokenize, remove_columns=ds.column_names)
 
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
-#breakpoint()
-#sample_input = next(iter(ds))
-#output = model(**sample_input)
-
-
 recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
 
 # Apply algorithms.
@@ -103,15 +87,14 @@ def tokenize(sample):
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
-sample_input = whisper_data_collator([next(iter(ds))]).to(model.device)
+sample_input = whisper_data_collator([next(iter(ds))])
 sample_input = {k: v.to("cuda:0") for k, v in sample_input.items()}
-output = model.generate(**sample_input)
-breakpoint()
+output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
-#[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']
+# Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel
 print("==========================================\n\n")
 
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
\ No newline at end of file
+processor.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
index 8a4c150cd..a5ec88570 100644
--- a/src/llmcompressor/transformers/finetune/data/base.py
+++ b/src/llmcompressor/transformers/finetune/data/base.py
@@ -105,7 +105,9 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
         dataset = self.rename_columns(dataset)
         logger.debug(f"Dataset after column renaming: {get_columns(dataset)}")
 
-        if "input_ids" not in get_columns(dataset) and "input_features" not in get_columns(dataset):
+        if "input_ids" not in get_columns(
+            dataset
+        ) and "input_features" not in get_columns(dataset):
             # tokenize/ process
             dataset = self.filter_tokenizer_args(dataset)
             logger.debug(f"Tokenizer args after filtering: {get_columns(dataset)}")
diff --git a/src/llmcompressor/transformers/tracing/whisper.py b/src/llmcompressor/transformers/tracing/whisper.py
index b8241f764..6e245760c 100644
--- a/src/llmcompressor/transformers/tracing/whisper.py
+++ b/src/llmcompressor/transformers/tracing/whisper.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # coding=utf-8
 # Copyright 2022 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved.
 #

From 8ca9b6d718826fd57a08050b338e8f46a905e264 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 01:35:52 +0000
Subject: [PATCH 04/21] readme todo

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/README.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 examples/multimodal_audio/README.md

diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md
new file mode 100644
index 000000000..30404ce4c
--- /dev/null
+++ b/examples/multimodal_audio/README.md
@@ -0,0 +1 @@
+TODO
\ No newline at end of file

From 98aca1686ded0947364e381f654930ec7f58b7c3 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 05:46:39 +0000
Subject: [PATCH 05/21] split to peoples_speech dataset

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 .../multimodal_audio/qwen2_audio_example.py   | 118 ++++++++++++++++++
 examples/multimodal_audio/whisper_example.py  |  14 ++-
 2 files changed, 128 insertions(+), 4 deletions(-)
 create mode 100644 examples/multimodal_audio/qwen2_audio_example.py

diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
new file mode 100644
index 000000000..a3a1d6b13
--- /dev/null
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -0,0 +1,118 @@
+from datasets import load_dataset
+from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
+from llmcompressor.transformers.utils.data_collator import whisper_data_collator
+
+# Select model and load it.
+MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
+
+model = Qwen2AudioForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "MLCommons/peoples_speech"
+DATASET_SUBSET = "test"
+DATASET_SPLIT = "test"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
+ds = load_dataset(
+    DATASET_ID,
+    DATASET_SUBSET,
+    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
+    trust_remote_code=True,
+)
+
+
+def preprocess(example):
+    conversation = [
+        {"role": "user", "content": [
+            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
+        ]},
+        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
+        {"role": "user", "content": [
+            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
+        ]},
+    ]
+    return {
+        "text": processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+    }
+    return {
+        "array": example["audio"]["array"],
+        "sampling_rate": example["audio"]["sampling_rate"],
+    }
+
+
+ds = ds.map(preprocess, remove_columns=ds.column_names)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    batch_size = 1
+    input_features = processor(
+        sample["array"],
+        sampling_rate=sample["sampling_rate"],
+        return_tensors="pt",
+    ).input_features
+
+    generation_config, _kwargs = model._prepare_generation_config(None)
+
+    input_stride = (
+        model.model.encoder.conv1.stride[0] * model.model.encoder.conv2.stride[0]
+    )
+    num_segment_frames = input_stride * model.config.max_source_positions
+
+    decoder_input_ids = model._retrieve_init_tokens(
+        input_features,
+        batch_size=batch_size,
+        generation_config=generation_config,
+        config=model.config,
+        num_segment_frames=num_segment_frames,
+        kwargs={},
+    )
+
+    return {"input_features": input_features, "decoder_input_ids": decoder_input_ids}
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# Configure the quantization algorithm to run.
+#   * quantize the weights to 4 bit with GPTQ with a group size 128
+recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+
+# Apply algorithms.
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    data_collator=whisper_data_collator,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+sample_input = whisper_data_collator([next(iter(ds))])
+sample_input = {k: v.to("cuda:0") for k, v in sample_input.items()}
+output = model.generate(**sample_input, language="en")
+print(processor.batch_decode(output, skip_special_tokens=True)[0])
+print("==========================================\n\n")
+# If you are interested in doing something to your house, go to the green building
+# adviser, look it up and see what the experts are talking about
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index dd3bf2fb1..b87185252 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -18,8 +18,9 @@
 processor = WhisperProcessor.from_pretrained(MODEL_ID)
 
 # Select calibration dataset.
-DATASET_ID = "hf-internal-testing/librispeech_asr_dummy"
-DATASET_SPLIT = "validation[:512]"
+DATASET_ID = "MLCommons/peoples_speech"
+DATASET_SUBSET = "test"
+DATASET_SPLIT = "test"
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
@@ -27,7 +28,12 @@
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
-ds = load_dataset(DATASET_ID, "clean", split=DATASET_SPLIT)
+ds = load_dataset(
+    DATASET_ID,
+    DATASET_SUBSET,
+    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
+    trust_remote_code=True,
+)
 
 
 def preprocess(example):
@@ -91,8 +97,8 @@ def tokenize(sample):
 sample_input = {k: v.to("cuda:0") for k, v in sample_input.items()}
 output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
-# Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel
 print("==========================================\n\n")
+# The track appears on the compilation album "Kraftworks"
 
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"

From 7067c3f6d087012c020ecd42a2301b11eb4585c2 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 06:09:24 +0000
Subject: [PATCH 06/21] clean up whisper example, add TODO to check processor.model_input_names

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/whisper_example.py       | 14 ++++++--------
 .../transformers/finetune/data/base.py             |  1 +
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index b87185252..ad7830d6f 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -7,7 +7,7 @@
 from llmcompressor.transformers.utils.data_collator import whisper_data_collator
 
 # Select model and load it.
-MODEL_ID = "openai/whisper-tiny"
+MODEL_ID = "openai/whisper-base"
 
 model = TraceableWhisperForConditionalGeneration.from_pretrained(
     MODEL_ID,
@@ -46,25 +46,23 @@ def preprocess(example):
 ds = ds.map(preprocess, remove_columns=ds.column_names)
 
 
-# Tokenize inputs.
-def tokenize(sample):
-    batch_size = 1
+# Process inputs.
+def process(sample):
     input_features = processor(
         sample["array"],
         sampling_rate=sample["sampling_rate"],
         return_tensors="pt",
     ).input_features
 
+    # decoder_input_ids define the task context
     generation_config, _kwargs = model._prepare_generation_config(None)
-
     input_stride = (
         model.model.encoder.conv1.stride[0] * model.model.encoder.conv2.stride[0]
     )
     num_segment_frames = input_stride * model.config.max_source_positions
-
     decoder_input_ids = model._retrieve_init_tokens(
         input_features,
-        batch_size=batch_size,
+        batch_size=1,
         generation_config=generation_config,
         config=model.config,
         num_segment_frames=num_segment_frames,
@@ -74,7 +72,7 @@ def tokenize(sample):
     return {"input_features": input_features, "decoder_input_ids": decoder_input_ids}
 
 
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+ds = ds.map(process, remove_columns=ds.column_names)
 
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
index a5ec88570..10e014949 100644
--- a/src/llmcompressor/transformers/finetune/data/base.py
+++ b/src/llmcompressor/transformers/finetune/data/base.py
@@ -105,6 +105,7 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
         dataset = self.rename_columns(dataset)
         logger.debug(f"Dataset after column renaming: {get_columns(dataset)}")
 
+        # TODO: investigate processor.model_input_names
         if "input_ids" not in get_columns(
             dataset
         ) and "input_features" not in get_columns(dataset):

From 0848fb64649741162b9ad0ffb6adc2c4283dae82 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 06:29:27 +0000
Subject: [PATCH 07/21] qwen2_audio: add example, model copy and collator; still need to add traceability

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 .../multimodal_audio/qwen2_audio_example.py   |   67 +-
 examples/multimodal_audio/whisper_example.py  |    2 +-
 .../transformers/tracing/qwen2_audio.py       | 1368 +++++++++++++++++
 .../transformers/utils/data_collator.py       |   10 +
 4 files changed, 1401 insertions(+), 46 deletions(-)
 create mode 100644 src/llmcompressor/transformers/tracing/qwen2_audio.py

diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index a3a1d6b13..2f0f1cc35 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -1,10 +1,9 @@
 from datasets import load_dataset
-from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
+from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
-from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
-from llmcompressor.transformers.utils.data_collator import whisper_data_collator
+from llmcompressor.transformers.utils.data_collator import qwen2_audio_data_collator
 
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -36,20 +35,21 @@
 
 
 def preprocess(example):
-    conversation = [
-        {"role": "user", "content": [
-            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
-        ]},
-        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
-        {"role": "user", "content": [
-            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
-        ]},
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "audio", "audio": None},
+                {"type": "text", "text": "What does the person say?"},
+            ],
+        },
     ]
+
     return {
-        "text": processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
-    }
-    return {
-        "array": example["audio"]["array"],
+        "text": processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=False
+        ),
+        "audios": [example["audio"]["array"]],
         "sampling_rate": example["audio"]["sampling_rate"],
     }
 
@@ -59,30 +59,7 @@ def preprocess(example):
 
 # Tokenize inputs.
 def tokenize(sample):
-    batch_size = 1
-    input_features = processor(
-        sample["array"],
-        sampling_rate=sample["sampling_rate"],
-        return_tensors="pt",
-    ).input_features
-
-    generation_config, _kwargs = model._prepare_generation_config(None)
-
-    input_stride = (
-        model.model.encoder.conv1.stride[0] * model.model.encoder.conv2.stride[0]
-    )
-    num_segment_frames = input_stride * model.config.max_source_positions
-
-    decoder_input_ids = model._retrieve_init_tokens(
-        input_features,
-        batch_size=batch_size,
-        generation_config=generation_config,
-        config=model.config,
-        num_segment_frames=num_segment_frames,
-        kwargs={},
-    )
-
-    return {"input_features": input_features, "decoder_input_ids": decoder_input_ids}
+    return processor(**sample, return_tensors="pt")
 
 
 ds = ds.map(tokenize, remove_columns=ds.column_names)
@@ -98,19 +75,19 @@ def tokenize(sample):
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=whisper_data_collator,
+    data_collator=qwen2_audio_data_collator,
 )
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
-sample_input = whisper_data_collator([next(iter(ds))])
-sample_input = {k: v.to("cuda:0") for k, v in sample_input.items()}
-output = model.generate(**sample_input, language="en")
+breakpoint()
+sample_input = qwen2_audio_data_collator([next(iter(ds))])
+sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
+output = model.generate(**sample_input)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
 print("==========================================\n\n")
-# If you are interested in doing something to your house, go to the green building
-# adviser, look it up and see what the experts are talking about
+# that's where you have a lot of windows in the
 
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index ad7830d6f..55c9a2fab 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -92,7 +92,7 @@ def process(sample):
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 sample_input = whisper_data_collator([next(iter(ds))])
-sample_input = {k: v.to("cuda:0") for k, v in sample_input.items()}
+sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
 output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
 print("==========================================\n\n")
diff --git a/src/llmcompressor/transformers/tracing/qwen2_audio.py b/src/llmcompressor/transformers/tracing/qwen2_audio.py
new file mode 100644
index 000000000..42fca41b9
--- /dev/null
+++ b/src/llmcompressor/transformers/tracing/qwen2_audio.py
@@ -0,0 +1,1368 @@
+# flake8: noqa
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# vllm-project: no copyright
+"""PyTorch Qwen2Audio model."""
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, EncoderDecoderCache, StaticCache
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutput, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from ..auto import AutoModel, AutoModelForCausalLM
+from .configuration_qwen2_audio import Qwen2AudioConfig, Qwen2AudioEncoderConfig
+
+
+if is_flash_attn_2_available():
+    from ...modeling_flash_attention_utils import _flash_attention_forward
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Qwen2AudioConfig"
+
+
+@dataclass
+class Qwen2AudioCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for Qwen2Audio causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        attention_mask (`torch.FloatTensor`, *optional*):
+            Attention mask, used to update attention mask and position_ids.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    attention_mask: Optional[torch.FloatTensor] = None
+
+
+# Copied from transformers.models.whisper.modeling_whisper.WhisperAttention with Whisper->Qwen2Audio
+class Qwen2AudioAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        layer_idx: Optional[int] = None,
+        config: Optional[Qwen2AudioConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        if layer_idx is None and is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+        self.layer_idx = layer_idx
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    # Copied from transformers.models.bart.modeling_bart.BartAttention._shape with BART->whisper
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[EncoderDecoderCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)
+
+        if past_key_value is not None:
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                past_key_value.is_updated[self.layer_idx] = True
+                past_key_value = past_key_value.cross_attention_cache
+            else:
+                past_key_value = past_key_value.self_attention_cache
+
+        # use key_value_states if cross attention
+        current_states = key_value_states if key_value_states is not None else hidden_states
+        if is_cross_attention and past_key_value and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value.key_cache[self.layer_idx]
+            value_states = past_key_value.value_cache[self.layer_idx]
+        else:
+            key_states = self._shape(self.k_proj(current_states), -1, bsz)
+            value_states = self._shape(self.v_proj(current_states), -1, bsz)
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_probs, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.whisper.modeling_whisper.WhisperFlashAttention2 with Whisper->Qwen2Audio
+class Qwen2AudioFlashAttention2(Qwen2AudioAttention):
+    """
+    Qwen2Audio flash attention module. This module inherits from `Qwen2AudioAttention` as the weights of the module stay
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[EncoderDecoderCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if isinstance(past_key_value, StaticCache):
+            raise ValueError(
+                "The `static` cache implementation is not compatible with `attn_implementation='flash_attention_2'`. "
+                "Use `attn_implementation='sdpa'` in the meantime, and open an issue at https://github.com/huggingface/transformers"
+            )
+        # Qwen2AudioFlashAttention2 attention does not support output_attentions
+        if output_attentions:
+            raise ValueError("Qwen2AudioFlashAttention2 attention does not support output_attentions")
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = torch.reshape(self.q_proj(hidden_states), (bsz, tgt_len, self.num_heads, self.head_dim))
+
+        if past_key_value is not None:
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                past_key_value.is_updated[self.layer_idx] = True
+                past_key_value = past_key_value.cross_attention_cache
+            else:
+                past_key_value = past_key_value.self_attention_cache
+
+        # use key_value_states if cross attention
+        current_states = key_value_states if key_value_states is not None else hidden_states
+        if is_cross_attention and past_key_value and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value.key_cache[self.layer_idx]
+            value_states = past_key_value.value_cache[self.layer_idx]
+        else:
+            key_states = self._shape(self.k_proj(current_states), -1, bsz)
+            value_states = self._shape(self.v_proj(current_states), -1, bsz)
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+
+        # TODO: These transposes are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]
+        #  We would need to refactor the KV cache to be able to avoid many of these transpose/reshape/view.
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, : key_states.shape[-2]]
+
+        # In PEFT, the layer norms are usually cast to float32 for training stability reasons,
+        # therefore the input hidden states get silently cast to float32 as well. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+        # to fp32. (LlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            causal_mask,
+            tgt_len,
+            dropout=self.dropout if self.training else 0.0,
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_output = attn_output.reshape(bsz, tgt_len, -1)
+        attn_output = self.out_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.whisper.modeling_whisper.WhisperSdpaAttention with Whisper->Qwen2Audio
+class Qwen2AudioSdpaAttention(Qwen2AudioAttention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[EncoderDecoderCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        if output_attentions or layer_head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "Qwen2AudioModel is using Qwen2AudioSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states,
+                key_value_states=key_value_states,
+                past_key_value=past_key_value,
+                attention_mask=attention_mask,
+                layer_head_mask=layer_head_mask,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz)
+
+        if past_key_value is not None:
+            is_updated = past_key_value.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                past_key_value.is_updated[self.layer_idx] = True
+                past_key_value = past_key_value.cross_attention_cache
+            else:
+                past_key_value = past_key_value.self_attention_cache
+
+        # use key_value_states if cross attention
+        current_states = key_value_states if key_value_states is not None else hidden_states
+        if is_cross_attention and past_key_value and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value.key_cache[self.layer_idx]
+            value_states = past_key_value.value_cache[self.layer_idx]
+        else:
+            key_states = self._shape(self.k_proj(current_states), -1, bsz)
+            value_states = self._shape(self.v_proj(current_states), -1, bsz)
+            if past_key_value is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+
+        causal_mask = attention_mask
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+        is_causal = True if self.is_causal and causal_mask is None and tgt_len > 1 else False
+
+        # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+        # but we are fine here as `_shape` does call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+QWEN2AUDIO_ATTENTION_CLASSES = {
+    "eager": Qwen2AudioAttention,
+    "flash_attention_2": Qwen2AudioFlashAttention2,
+    "sdpa": Qwen2AudioSdpaAttention,
+}
+
+
+# Copied from transformers.models.whisper.modeling_whisper.WhisperEncoderLayer with Whisper->Qwen2Audio, WHISPER->QWEN2AUDIO
+class Qwen2AudioEncoderLayer(nn.Module):
+    def __init__(self, config: Qwen2AudioConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = QWEN2AUDIO_ATTENTION_CLASSES[config._attn_implementation](
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+            config=config,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+QWEN2AUDIO_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Qwen2AudioConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Qwen2Audio Model outputting raw hidden-states without any specific head on top.",
+    QWEN2AUDIO_START_DOCSTRING,
+)
+class Qwen2AudioPreTrainedModel(PreTrainedModel):
+    config_class = Qwen2AudioConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Qwen2AudioAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+
+    def _init_weights(self, module):
+        # important: this ported version of Qwen2Audio isn't meant for training from scratch - only
+        # inference and fine-tuning - so the proper init weights code has been removed
+        std = self.config.init_std if hasattr(self.config, "init_std") else self.config.audio_config.init_std
+
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+QWEN2AUDIOENCODER_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Qwen2AudioEncoderConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    """The audio model from Qwen2Audio without any head or projection on top.""",
+    QWEN2AUDIOENCODER_START_DOCSTRING,
+)
+# Copied from transformers.models.whisper.modeling_whisper.WhisperEncoder with Whisper->Qwen2Audio
+class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`Qwen2AudioEncoderLayer`].
+
+    Args:
+        config: Qwen2AudioEncoderConfig
+    """
+
+    # Ignore copy
+    config_class = Qwen2AudioEncoderConfig
+    main_input_name = "input_features"
+    _no_split_modules = ["Qwen2AudioEncoderLayer"]
+
+    def __init__(self, config: Qwen2AudioEncoderConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        embed_dim = config.d_model
+        self.num_mel_bins = config.num_mel_bins
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)
+
+        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
+        self.embed_positions.requires_grad_(False)
+
+        self.layers = nn.ModuleList([Qwen2AudioEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+        # Ignore copy
+        self.avg_pooler = nn.AvgPool1d(2, stride=2)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.conv1
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.conv1 = value
+
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
+                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
+                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
+                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
+                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+            attention_mask (`torch.Tensor`)`, *optional*):
+                Qwen2Audio does not support masking of the `input_features`, this argument is preserved for compatibility,
+                but it is not used. By default the silence in the input log mel spectrogram are ignored.
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+
+        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
+        if input_features.shape[-1] != expected_seq_length:
+            raise ValueError(
+                f"Qwen2Audio expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
+            )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Ignore copy
+        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)
+
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+        embed_pos = self.embed_positions.weight
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            # Ignore copy
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+                    layer_outputs = self._gradient_checkpointing_func(
+                        encoder_layer.__call__,
+                        hidden_states,
+                        attention_mask,
+                        (head_mask[idx] if head_mask is not None else None),
+                        output_attentions,
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        # Ignore copy
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states = self.avg_pooler(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    # Ignore copy
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Return `(conv_lengths, pooled_lengths)`: lengths after the stride-2 convolution and after the average pooling
+        """
+        conv_lengths = (input_lengths - 1) // 2 + 1
+        pooled_lengths = (conv_lengths - 2) // 2 + 1
+        return conv_lengths, pooled_lengths
+
+
+class Qwen2AudioMultiModalProjector(nn.Module):
+    """Linear projection from the audio encoder width (`d_model`) to the text model `hidden_size`."""
+    def __init__(self, config: Qwen2AudioConfig):
+        super().__init__()
+        self.linear = nn.Linear(config.audio_config.d_model, config.text_config.hidden_size, bias=True)
+
+    def forward(self, audio_features):
+        return self.linear(audio_features)
+
+
+QWEN2AUDIO_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`):
+            Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
+            loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
+            the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
+            [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
+            tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`):
+            Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""  # NOTE(review): the two "head is masked" bullets under `attention_mask` look like leftovers from a head_mask entry
+
+
+@add_start_docstrings(
+    """The QWEN2AUDIO model which consists of a audio backbone and a language model.""",
+    QWEN2AUDIO_START_DOCSTRING,
+)
+class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMixin):
+    def __init__(self, config: Qwen2AudioConfig):
+        super().__init__(config)
+        self.audio_tower = AutoModel.from_config(config.audio_config)
+        # Maps audio tower outputs (`audio_config.d_model`) to the language model width (`text_config.hidden_size`).
+        self.multi_modal_projector = Qwen2AudioMultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1  # -1 when unset
+        self._padding_side = "left"  # left padding by default; change it through the `padding_side` setter
+        self.post_init()
+
+    @property
+    def padding_side(self):  # "left" or "right"; used when the attention mask alone cannot reveal the padding side
+        return self._padding_side
+
+    @padding_side.setter
+    def padding_side(self, padding_side: str):
+        if padding_side != "left" and padding_side != "right":  # only the two literal sides are accepted
+            raise ValueError(f"{padding_side} is not `left` or `right`.")
+        self._padding_side = padding_side
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings
+    def get_input_embeddings(self):  # delegates to the wrapped language model
+        return self.language_model.get_input_embeddings()
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings
+    def set_input_embeddings(self, value):  # delegates to the wrapped language model
+        self.language_model.set_input_embeddings(value)
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings
+    def get_output_embeddings(self):  # delegates to the wrapped language model
+        return self.language_model.get_output_embeddings()
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):  # delegates to the wrapped language model
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder
+    def set_decoder(self, decoder):  # delegates to the wrapped language model
+        self.language_model.set_decoder(decoder)
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder
+    def get_decoder(self):  # delegates to the wrapped language model
+        return self.language_model.get_decoder()
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights
+    def tie_weights(self):  # delegates to the wrapped language model
+        return self.language_model.tie_weights()
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        """Resize the language model's token embeddings and keep both recorded vocab sizes in sync."""
+        resized = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        new_vocab_size = resized.num_embeddings
+        self.config.text_config.vocab_size = self.vocab_size = new_vocab_size
+        return resized
+
+    def _merge_input_ids_with_audio_features(
+        self, audio_features, num_audio_tokens, inputs_embeds, input_ids, attention_mask, labels
+    ):
+        """
+        Merge input_ids with audio features into final embeddings
+
+        Args:
+            audio_features (`torch.Tensor` of shape `(num_audios, max_audio_tokens, embed_dim)`):
+                All audio vectors of all audios in the batch
+            num_audio_tokens (`torch.LongTensor` of shape `(num_audios)`):
+                The length of audio embeddings of each audio as stacked in `audio_features`
+            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, embed_dim)`):
+                Token embeddings before merging with audio embeddings
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Input_ids of tokens, possibly filled with audio token
+            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Mask to avoid performing attention on padding token indices.
+            labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                labels need to be recalculated to support training (if provided)
+        Returns:
+            final_embedding, final_attention_mask, final_labels, position_ids, final_input_ids
+
+        Explanation:
+            each audio has variable length embeddings, with length specified by num_audio_tokens
+            audio_features is concatenation of all audio embed vectors
+            task: fill each <|AUDIO|> with the correct number of audio embeddings
+            Example:
+                X (5 tokens), Y (3 tokens), Z (8 tokens)
+                X, Y are in the same sequence (in-context learning)
+            if right padding
+                input_ids: [
+                    a b c d e f X g h i j k Y l m
+                    o p q r Z s t u v _ _ _ _ _ _
+                ]
+                input_ids should be: [
+                    a b c d e f X X X X X g h i j k Y Y Y l m
+                    o p q r Z Z Z Z Z Z Z Z s t u v _ _ _ _ _
+                ]
+                labels should be: [
+                    a b c d e f _ _ _ _ _ g h i j k _ _ _ l m
+                    o p q r _ _ _ _ _ _ _ _ s t u v _ _ _ _ _
+                ]
+            elif left padding
+                input_ids: [
+                    a b c d e f X g h i j k Y l m
+                    _ _ _ _ _ _ o p q r Z s t u v
+                ]
+                input_ids should be: [
+                    a b c d e f X X X X X g h i j k Y Y Y l m
+                    _ _ _ _ _ o p q r Z Z Z Z Z Z Z Z s t u v
+                ]
+                labels should be: [
+                    a b c d e f _ _ _ _ _ g h i j k _ _ _ l m
+                    _ _ _ _ _ o p q r _ _ _ _ _ _ _ _ s t u v
+                ]
+            Edge cases:
+                * If tokens are same but audio token sizes are different, then cannot infer left or right padding
+                ```python
+                url1 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"
+                audio1, _ = librosa.load(BytesIO(urlopen(url1).read()), sr=processor.feature_extractor.sampling_rate)
+                url2 = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"
+                audio2, _ = librosa.load(BytesIO(urlopen(url2).read()), sr=processor.feature_extractor.sampling_rate)
+                prompts = [
+                    "[INST] <|AUDIO|>\nWhat is that in this audio? [/INST]",
+                    "[INST] <|AUDIO|>\nWhat is that in this audio? [/INST]",
+                ]
+                inputs = processor(text=prompts, audios=[audio1, audio2], return_tensors='pt', padding=True).to("cuda")
+                    audio1 has 101 tokens, while audio2 has 72 tokens
+                ```
+
+                input_ids: [
+                    a b c d X g h
+                    i j Y k l m n
+                ]
+                where X is 3 tokens while Y is 5, this means after merge
+                if left-padding (batched generation)
+                    input_ids should be: [
+                        _ _ a b c d X X X g h
+                        i j Y Y Y Y Y k l m n
+                    ]
+                elif (right padding) (training)
+                    input_ids should be: [
+                        a b c d X X X g h _ _
+                        i j Y Y Y Y Y k l m n
+                    ]
+        """
+        num_audios, max_audio_tokens, embed_dim = audio_features.shape
+        audio_features_mask = torch.arange(max_audio_tokens).expand(num_audios, max_audio_tokens).to(
+            num_audio_tokens.device
+        ) < num_audio_tokens.unsqueeze(1)
+        masked_audio_features = audio_features[audio_features_mask].view(-1, embed_dim)  # valid audio vectors only
+        batch_size, sequence_length = input_ids.shape
+        _left_padding = torch.any(attention_mask[:, 0] == 0)
+        _right_padding = torch.any(attention_mask[:, -1] == 0)
+
+        left_padding = True  # single-sample batches skip the detection below and are treated as left-padded
+        if batch_size > 1:
+            if _left_padding and not _right_padding:
+                left_padding = True
+            elif not _left_padding and _right_padding:
+                left_padding = False
+            elif not _left_padding and not _right_padding:
+                # no zeros on either edge, so the mask cannot tell; fall back to the configured side
+                left_padding = self.padding_side == "left"
+            else:
+                # zeros on both edges: the padding side is ambiguous, reject the mask
+                raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
+
+        # 1. Create a mask to know where special audio tokens are
+        special_audio_token_mask = input_ids == self.config.audio_token_index
+        num_special_audio_tokens = torch.sum(special_audio_token_mask, dim=-1)
+
+        # In case the Audio model or the Language model has been offloaded to CPU, we need to manually
+        # set the corresponding tensors into their correct target device.
+        target_device = inputs_embeds.device
+        attention_mask = attention_mask.to(target_device)
+        input_ids = input_ids.to(target_device)
+        num_audio_tokens = num_audio_tokens.to(target_device)
+        batch_indices, non_audio_indices = torch.where(
+            (input_ids != self.config.audio_token_index) & (attention_mask == 1)
+        )
+
+        # 2. Compute the positions where text should be written
+        # Calculate new positions for text tokens in merged audio-text sequence.
+        # Each audio placeholder expands to `num_audio_tokens` positions, i.e. it adds `num_audio_tokens - 1` extra slots.
+        # `torch.cumsum` over the per-token widths gives the last merged position occupied by each original token.
+        token_placeholder_num = torch.zeros_like(input_ids)
+        token_placeholder_num[special_audio_token_mask] = num_audio_tokens.long() - 1
+        token_placeholder_num = token_placeholder_num + 1
+        new_token_positions = torch.cumsum(token_placeholder_num, -1) - 1
+        max_token_num = token_placeholder_num.sum(-1).max()
+        nb_audio_pad = max_token_num - 1 - new_token_positions[:, -1]  # per-row shortfall vs. the longest merged row
+        if left_padding:
+            new_token_positions += nb_audio_pad[:, None]  # offset for left padding
+        text_to_overwrite = new_token_positions[batch_indices, non_audio_indices]
+        batch_indices, non_audio_indices, text_to_overwrite = (
+            batch_indices.to(target_device),
+            non_audio_indices.to(target_device),
+            text_to_overwrite.to(target_device),
+        )
+
+        # 3. Create the full embedding, already padded to the maximum position
+        final_embedding = torch.zeros(
+            batch_size, max_token_num, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+        final_attention_mask = torch.zeros(
+            batch_size, max_token_num, dtype=attention_mask.dtype, device=inputs_embeds.device
+        )
+        final_input_ids = torch.full(
+            (batch_size, max_token_num), self.pad_token_id, dtype=input_ids.dtype, device=inputs_embeds.device
+        )
+
+        # 4. Fill the text embeddings. E.g. for ["hey", "<audio>", "how", "are"] with a 576-token audio, the text is
+        # written at positions [0, 577, 578] and positions 1..576 are left for the audio features
+        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_audio_indices]
+        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_audio_indices]
+        final_input_ids[batch_indices, text_to_overwrite] = input_ids[batch_indices, non_audio_indices]
+        final_labels = None
+        if labels is not None:
+            labels = labels.to(target_device)
+            final_labels = torch.full_like(final_attention_mask, self.config.ignore_index).to(torch.long)
+            final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_audio_indices]
+
+        # 5. Fill the audio embeddings: every slot not written by text and not excluded as padding below is audio
+        audio_to_overwrite = torch.full(
+            (batch_size, max_token_num), True, dtype=torch.bool, device=inputs_embeds.device
+        )
+        audio_to_overwrite[batch_indices, text_to_overwrite] = False
+        seq_indices = torch.arange(max_token_num).unsqueeze(0).to(target_device)
+        seq_indices = seq_indices.expand(batch_size, max_token_num)
+
+        if left_padding:
+            # exclude padding on the left
+            max_token_num = max_token_num.to(target_device)
+            val = (max_token_num - seq_indices) <= (
+                token_placeholder_num.sum(-1) - (attention_mask == 0).long().sum(-1)
+            )[:, None]
+        else:
+            # exclude padding on the right
+            val = seq_indices < (token_placeholder_num.sum(-1) - (attention_mask == 0).long().sum(-1))[:, None]
+
+        audio_to_overwrite &= val
+
+        if audio_to_overwrite.sum() != num_audio_tokens.sum():
+            raise ValueError(
+                f"The input provided to the model are wrong. The number of audio tokens is {num_special_audio_tokens} while"
+                f" the number of audio given to the model is {num_audios}. This prevents correct indexing and breaks batch generation."
+            )
+
+        final_embedding[audio_to_overwrite] = (
+            masked_audio_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        )
+        final_attention_mask |= audio_to_overwrite
+        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+
+        return final_embedding, final_attention_mask, final_labels, position_ids, final_input_ids
+
+    @add_start_docstrings_to_model_forward(QWEN2AUDIO_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Qwen2AudioCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        input_features: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        feature_attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Qwen2AudioCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from io import BytesIO
+        >>> from urllib.request import urlopen
+        >>> import librosa
+        >>> from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+
+        >>> model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B")
+        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B")
+
+        >>> prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
+        >>> url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"
+        >>> audio, _ = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)
+
+        >>> inputs = processor(text=prompt, audios=audio, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Generate the caption in English: Glass is breaking."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        target_device = self.audio_tower.device
+
+        if input_features is not None:
+            input_features = input_features.to(target_device)
+            feature_attention_mask = feature_attention_mask.to(target_device)
+
+        if inputs_embeds is None:
+            # 1. Extract the input embeddings
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            # 2. Merge text and audios (skipped when `input_ids` holds a single token per row)
+            if input_features is not None and input_ids.shape[1] != 1:
+                audio_feat_lengths, audio_output_lengths = self.audio_tower._get_feat_extract_output_lengths(
+                    feature_attention_mask.sum(-1)
+                )
+                batch_size, _, max_mel_seq_len = input_features.shape
+                max_seq_len = (max_mel_seq_len - 2) // 2 + 1
+                # Create a sequence tensor of shape (batch_size, max_seq_len)
+                seq_range = (
+                    torch.arange(0, max_seq_len, dtype=audio_feat_lengths.dtype, device=audio_feat_lengths.device)
+                    .unsqueeze(0)
+                    .expand(batch_size, max_seq_len)
+                )
+                lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
+                # Create mask: True at positions at or beyond each audio's valid feature length
+                padding_mask = seq_range >= lengths_expand
+
+                audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
+                    batch_size, 1, max_seq_len, max_seq_len
+                )
+                audio_attention_mask = audio_attention_mask_.to(
+                    dtype=self.audio_tower.conv1.weight.dtype, device=self.audio_tower.conv1.weight.device
+                )
+                audio_attention_mask[audio_attention_mask_] = float("-inf")  # padded positions -> -inf, others stay 0
+
+                audio_outputs = self.audio_tower(input_features, attention_mask=audio_attention_mask)
+                selected_audio_feature = audio_outputs.last_hidden_state
+                audio_features = self.multi_modal_projector(selected_audio_feature)
+
+                inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features(
+                    audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels
+                )
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        logits = outputs[0]
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n; with a mask, keep only positions whose target token is attended
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return Qwen2AudioCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            attention_mask=attention_mask,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        input_features=None,
+        attention_mask=None,
+        **kwargs,
+    ):
+        # Overwritten -- custom processing (note: might not be needed, but there are no generation tests running atm)
+
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+
+            # Here, we get the attention_mask, which was previously stored in the state after _merge_input_ids_with_audio_features.
+            if input_features is not None and kwargs.get("attention_mask") is not None:
+                attention_mask = kwargs["attention_mask"]
+                attention_mask = torch.cat(
+                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                )
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            elif self.config.audio_token_index in input_ids:
+                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
+            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
+            # older attention values, as their corresponding values are not part of the input.
+            if cache_length < past_length and attention_mask is not None:
+                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        feature_attention_mask = kwargs.get("feature_attention_mask", None)
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "input_features": input_features,
+                "feature_attention_mask": feature_attention_mask,
+            }
+        )
+        return model_inputs
+
+    def _update_model_kwargs_for_generation(
+        self,
+        outputs: ModelOutput,
+        model_kwargs: Dict[str, Any],
+        is_encoder_decoder: bool = False,
+        num_new_tokens: int = 1,
+    ) -> Dict[str, Any]:
+        # update past_key_values keeping its naming used in model code
+        cache_name, cache = self._extract_past_from_model_output(outputs)
+        model_kwargs[cache_name] = cache
+        if getattr(outputs, "state", None) is not None:
+            model_kwargs["state"] = outputs.state
+
+        # update attention_mask
+        if getattr(outputs, "attention_mask", None) is not None:
+            model_kwargs["attention_mask"] = outputs.attention_mask
+
+        # update token_type_ids with last value
+        if "token_type_ids" in model_kwargs:
+            token_type_ids = model_kwargs["token_type_ids"]
+            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+        if not is_encoder_decoder:
+            # update attention mask
+            if "attention_mask" in model_kwargs:
+                attention_mask = model_kwargs["attention_mask"]
+                model_kwargs["attention_mask"] = torch.cat(
+                    [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
+                )
+        else:
+            # update decoder attention mask
+            if "decoder_attention_mask" in model_kwargs:
+                decoder_attention_mask = model_kwargs["decoder_attention_mask"]
+                model_kwargs["decoder_attention_mask"] = torch.cat(
+                    [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))],
+                    dim=-1,
+                )
+
+        if model_kwargs.get("use_cache", True):
+            model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
+        else:
+            past_positions = model_kwargs.pop("cache_position")
+            new_positions = torch.arange(
+                past_positions[-1] + 1, past_positions[-1] + num_new_tokens + 1, dtype=past_positions.dtype
+            ).to(past_positions.device)
+            model_kwargs["cache_position"] = torch.cat((past_positions, new_positions))
+        return model_kwargs
+
+    def _reorder_cache(self, *args, **kwargs):
+        return self.language_model._reorder_cache(*args, **kwargs)
diff --git a/src/llmcompressor/transformers/utils/data_collator.py b/src/llmcompressor/transformers/utils/data_collator.py
index 111e12b07..f0c70cf84 100644
--- a/src/llmcompressor/transformers/utils/data_collator.py
+++ b/src/llmcompressor/transformers/utils/data_collator.py
@@ -54,3 +54,13 @@ def whisper_data_collator(batch):
         "input_features": torch.tensor(batch[0]["input_features"]),
         "decoder_input_ids": torch.tensor(batch[0]["decoder_input_ids"]),
     }
+
+
+def qwen2_audio_data_collator(batch):
+    assert len(batch) == 1
+    return {
+        "input_ids": torch.LongTensor(batch[0]["input_ids"]),
+        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
+        "input_features": torch.tensor(batch[0]["input_features"]),
+        "feature_attention_mask": torch.tensor(batch[0]["feature_attention_mask"]),
+    }
\ No newline at end of file

From fbb6322007da3b569eed646e14c8192bbc89774c Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sun, 19 Jan 2025 06:39:58 +0000
Subject: [PATCH 08/21] WIP

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 .../multimodal_audio/qwen2_audio_example.py   | 11 +++++++---
 .../modifiers/quantization/gptq/base.py       |  2 ++
 .../transformers/tracing/__init__.py          |  4 ++++
 .../transformers/tracing/qwen2_audio.py       | 22 ++++++++++---------
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index 2f0f1cc35..9abda5f36 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -1,14 +1,15 @@
 from datasets import load_dataset
-from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+from transformers import AutoProcessor
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.utils.data_collator import qwen2_audio_data_collator
+from llmcompressor.transformers.tracing import TraceableQwen2AudioForConditionalGeneration
 
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 
-model = Qwen2AudioForConditionalGeneration.from_pretrained(
+model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
     torch_dtype="auto",
@@ -66,7 +67,11 @@ def tokenize(sample):
 
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+recipe = GPTQModifier(
+    targets="Linear",
+    scheme="W4A16",
+    ignore=["re:audio_tower.*", "re:multi_modal_projector.*", "lm_head"]  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
+)
 
 # Apply algorithms.
 oneshot(
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 8519ca0d6..df40ebb62 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -247,6 +247,8 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
             if isinstance(exception, unfixable_errors):
                 raise exception
 
+            raise exception
+
             warnings.warn("Falling back to layer_sequential pipeline")
             try:
                 run_layer_sequential(
diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py
index b9202d95b..fee345733 100644
--- a/src/llmcompressor/transformers/tracing/__init__.py
+++ b/src/llmcompressor/transformers/tracing/__init__.py
@@ -8,10 +8,14 @@
 from .whisper import (
     WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration
 )
+from .qwen2_audio import (
+    Qwen2AudioForConditionalGeneration as TraceableQwen2AudioForConditionalGeneration
+)
 
 __all__ = [
     "TraceableLlavaForConditionalGeneration",
     "TraceableMllamaForConditionalGeneration",
     "TraceableMistralForCausalLM",
     "TraceableWhisperForConditionalGeneration",
+    "TraceableQwen2AudioForConditionalGeneration",
 ]
diff --git a/src/llmcompressor/transformers/tracing/qwen2_audio.py b/src/llmcompressor/transformers/tracing/qwen2_audio.py
index 42fca41b9..06dc1ac8e 100644
--- a/src/llmcompressor/transformers/tracing/qwen2_audio.py
+++ b/src/llmcompressor/transformers/tracing/qwen2_audio.py
@@ -24,12 +24,12 @@
 import torch.utils.checkpoint
 from torch import nn
 
-from ...activations import ACT2FN
-from ...cache_utils import Cache, EncoderDecoderCache, StaticCache
-from ...generation import GenerationMixin
-from ...modeling_outputs import BaseModelOutput, ModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, EncoderDecoderCache, StaticCache
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import BaseModelOutput, ModelOutput
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
@@ -37,12 +37,12 @@
     logging,
     replace_return_docstrings,
 )
-from ..auto import AutoModel, AutoModelForCausalLM
-from .configuration_qwen2_audio import Qwen2AudioConfig, Qwen2AudioEncoderConfig
+from transformers import AutoModel, AutoModelForCausalLM
+from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioConfig, Qwen2AudioEncoderConfig
 
 
 if is_flash_attn_2_available():
-    from ...modeling_flash_attention_utils import _flash_attention_forward
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
 
 
 logger = logging.get_logger(__name__)
@@ -1092,7 +1092,9 @@ def _merge_input_ids_with_audio_features(
 
         audio_to_overwrite &= val
 
-        if audio_to_overwrite.sum() != num_audio_tokens.sum():
+        # TRACING
+        #if audio_to_overwrite.sum() != num_audio_tokens.sum():
+        if False:
             raise ValueError(
                 f"The input provided to the model are wrong. The number of audio tokens is {num_special_audio_tokens} while"
                 f" the number of audio given to the model is {num_audios}. This prevents correct indexing and breaks batch generation."

From f5daa3ddf325d55e0c5aff8f0be6fc7081374a88 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 19:25:44 +0000
Subject: [PATCH 09/21] use model_input_names

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 .../multimodal_audio/qwen2_audio_example.py   | 10 ++++--
 examples/multimodal_audio/whisper_example.py  |  2 +-
 .../transformers/finetune/data/base.py        |  7 ++--
 .../transformers/utils/data_collator.py       |  2 +-
 .../finetune/data/test_dataset_helpers.py     | 35 +++++++++++++++++++
 5 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index 9abda5f36..b27e79e39 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -3,8 +3,10 @@
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import (
+    TraceableQwen2AudioForConditionalGeneration,
+)
 from llmcompressor.transformers.utils.data_collator import qwen2_audio_data_collator
-from llmcompressor.transformers.tracing import TraceableQwen2AudioForConditionalGeneration
 
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -70,7 +72,11 @@ def tokenize(sample):
 recipe = GPTQModifier(
     targets="Linear",
     scheme="W4A16",
-    ignore=["re:audio_tower.*", "re:multi_modal_projector.*", "lm_head"]  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
+    ignore=[
+        "re:audio_tower.*",
+        "re:multi_modal_projector.*",
+        "lm_head",
+    ],  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
 )
 
 # Apply algorithms.
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index 55c9a2fab..1a167a2f6 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -7,7 +7,7 @@
 from llmcompressor.transformers.utils.data_collator import whisper_data_collator
 
 # Select model and load it.
-MODEL_ID = "openai/whisper-base"
+MODEL_ID = "openai/whisper-large-v2"
 
 model = TraceableWhisperForConditionalGeneration.from_pretrained(
     MODEL_ID,
diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
index 10e014949..345649e93 100644
--- a/src/llmcompressor/transformers/finetune/data/base.py
+++ b/src/llmcompressor/transformers/finetune/data/base.py
@@ -105,10 +105,9 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
         dataset = self.rename_columns(dataset)
         logger.debug(f"Dataset after column renaming: {get_columns(dataset)}")
 
-        # TODO: investigate processor.model_input_names
-        if "input_ids" not in get_columns(
-            dataset
-        ) and "input_features" not in get_columns(dataset):
+        model_input_names = getattr(self.processor, "model_input_names", ["input_ids"])
+        column_names = get_columns(dataset)
+        if any(name in model_input_names for name in column_names):
             # tokenize/ process
             dataset = self.filter_tokenizer_args(dataset)
             logger.debug(f"Tokenizer args after filtering: {get_columns(dataset)}")
diff --git a/src/llmcompressor/transformers/utils/data_collator.py b/src/llmcompressor/transformers/utils/data_collator.py
index f0c70cf84..04567d947 100644
--- a/src/llmcompressor/transformers/utils/data_collator.py
+++ b/src/llmcompressor/transformers/utils/data_collator.py
@@ -63,4 +63,4 @@ def qwen2_audio_data_collator(batch):
         "attention_mask": torch.tensor(batch[0]["attention_mask"]),
         "input_features": torch.tensor(batch[0]["input_features"]),
         "feature_attention_mask": torch.tensor(batch[0]["feature_attention_mask"]),
-    }
\ No newline at end of file
+    }
diff --git a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py
index 812b26a56..4ba4bfac4 100644
--- a/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py
+++ b/tests/llmcompressor/transformers/finetune/data/test_dataset_helpers.py
@@ -1,4 +1,5 @@
 import pytest
+from transformers import AutoProcessor
 
 from llmcompressor.transformers.finetune.data.data_args import DataTrainingArguments
 from llmcompressor.transformers.finetune.data.data_helpers import (
@@ -53,3 +54,37 @@ def test_separate_datasets():
         split_datasets = make_dataset_splits(
             datasets, do_train=True, do_eval=True, do_predict=True
         )
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize(
+    "model_id,expected",
+    [
+        ("meta-llama/Meta-Llama-3-8B-Instruct", ["input_ids", "attention_mask"]),
+        ("mistralai/Mixtral-8x7B-Instruct-v0.1", ["input_ids", "attention_mask"]),
+        (
+            "Qwen/Qwen2-VL-2B-Instruct",
+            [
+                "input_ids",
+                "attention_mask",
+                "pixel_values",
+                "image_grid_thw",
+                "pixel_values_videos",
+                "video_grid_thw",
+            ],
+        ),
+        ("mgoin/pixtral-12b", ["input_ids", "attention_mask", "pixel_values"]),
+        ("openai/whisper-large-v2", ["input_features"]),
+        (
+            "Qwen/Qwen2-Audio-7B-Instruct",
+            ["input_ids", "attention_mask", "input_features", "feature_attention_mask"],
+        ),
+    ],
+)
+def test_processor_model_input_names(model_id, expected):
+    """
+    Tests the model_input_names attribute of common model processors
+    """
+
+    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    assert processor.model_input_names == expected

From fa69e8abd3cb7ca9e8f7389f194f8aa40d65cdc2 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 19:30:43 +0000
Subject: [PATCH 10/21] remove debug statements

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 src/llmcompressor/modifiers/quantization/gptq/base.py | 2 --
 src/llmcompressor/modifiers/utils/pytorch_helpers.py  | 1 -
 2 files changed, 3 deletions(-)

diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index df40ebb62..8519ca0d6 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -247,8 +247,6 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
             if isinstance(exception, unfixable_errors):
                 raise exception
 
-            raise exception
-
             warnings.warn("Falling back to layer_sequential pipeline")
             try:
                 run_layer_sequential(
diff --git a/src/llmcompressor/modifiers/utils/pytorch_helpers.py b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
index 444a0bac2..50234461c 100644
--- a/src/llmcompressor/modifiers/utils/pytorch_helpers.py
+++ b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
@@ -41,7 +41,6 @@ def apply_pad_mask_to_batch(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.T
     :param batch: batch to apply padding to if it exists
     :return: batch with padding zeroed out in the input_ids
     """
-    print(batch.keys())
     if "input_ids" in batch and "attention_mask" in batch:
         batch["input_ids"] = batch["input_ids"] * batch["attention_mask"]
     return batch

From 504c7dc0f2af91d0018f8e7ae3e5efc38c8ea815 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 20:41:33 +0000
Subject: [PATCH 11/21] simplify example, fix tokenizer condition

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/whisper_example.py  | 41 ++++++++++---------
 .../transformers/finetune/data/base.py        |  2 +-
 .../transformers/utils/data_collator.py       |  1 +
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index 1a167a2f6..2c18597a5 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -1,3 +1,4 @@
+import torch
 from datasets import load_dataset
 from transformers import WhisperProcessor
 
@@ -17,6 +18,9 @@
 model.config.forced_decoder_ids = None
 processor = WhisperProcessor.from_pretrained(MODEL_ID)
 
+# Configure the processor for the dataset task.
+processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
+
 # Select calibration dataset.
 DATASET_ID = "MLCommons/peoples_speech"
 DATASET_SUBSET = "test"
@@ -24,7 +28,7 @@
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
+NUM_CALIBRATION_SAMPLES = 1  # 512
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.
@@ -40,6 +44,7 @@ def preprocess(example):
     return {
         "array": example["audio"]["array"],
         "sampling_rate": example["audio"]["sampling_rate"],
+        "text": " " + example["text"].capitalize(),
     }
 
 
@@ -48,28 +53,19 @@ def preprocess(example):
 
 # Process inputs.
 def process(sample):
-    input_features = processor(
-        sample["array"],
+    audio_inputs = processor(
+        audio=sample["array"],
         sampling_rate=sample["sampling_rate"],
         return_tensors="pt",
-    ).input_features
-
-    # decoder_input_ids define the task context
-    generation_config, _kwargs = model._prepare_generation_config(None)
-    input_stride = (
-        model.model.encoder.conv1.stride[0] * model.model.encoder.conv2.stride[0]
     )
-    num_segment_frames = input_stride * model.config.max_source_positions
-    decoder_input_ids = model._retrieve_init_tokens(
-        input_features,
-        batch_size=1,
-        generation_config=generation_config,
-        config=model.config,
-        num_segment_frames=num_segment_frames,
-        kwargs={},
+
+    text_inputs = processor(
+        text=sample["text"], add_special_tokens=True, return_tensors="pt"
     )
+    text_inputs["decoder_input_ids"] = text_inputs["input_ids"]
+    del text_inputs["input_ids"]
 
-    return {"input_features": input_features, "decoder_input_ids": decoder_input_ids}
+    return dict(**audio_inputs, **text_inputs)
 
 
 ds = ds.map(process, remove_columns=ds.column_names)
@@ -91,8 +87,13 @@ def process(sample):
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
-sample_input = whisper_data_collator([next(iter(ds))])
-sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
+sample_features = next(iter(ds))["input_features"]
+sample_decoder_ids = [processor.tokenizer.prefix_tokens]
+sample_input = {
+    "input_features": torch.tensor(sample_features).to(model.device),
+    "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device),
+}
+
 output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
 print("==========================================\n\n")
diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
index 345649e93..63f81e351 100644
--- a/src/llmcompressor/transformers/finetune/data/base.py
+++ b/src/llmcompressor/transformers/finetune/data/base.py
@@ -107,7 +107,7 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
 
         model_input_names = getattr(self.processor, "model_input_names", ["input_ids"])
         column_names = get_columns(dataset)
-        if any(name in model_input_names for name in column_names):
+        if not any(name in model_input_names for name in column_names):
             # tokenize/ process
             dataset = self.filter_tokenizer_args(dataset)
             logger.debug(f"Tokenizer args after filtering: {get_columns(dataset)}")
diff --git a/src/llmcompressor/transformers/utils/data_collator.py b/src/llmcompressor/transformers/utils/data_collator.py
index 04567d947..f819f43d4 100644
--- a/src/llmcompressor/transformers/utils/data_collator.py
+++ b/src/llmcompressor/transformers/utils/data_collator.py
@@ -53,6 +53,7 @@ def whisper_data_collator(batch):
     return {
         "input_features": torch.tensor(batch[0]["input_features"]),
         "decoder_input_ids": torch.tensor(batch[0]["decoder_input_ids"]),
+        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
     }
 
 

From e2f3735ed8d2d9e00c408dda9832bddf5cfb4696 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 20:43:25 +0000
Subject: [PATCH 12/21] shorten

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 src/llmcompressor/transformers/finetune/data/base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
index 63f81e351..f4cd6120e 100644
--- a/src/llmcompressor/transformers/finetune/data/base.py
+++ b/src/llmcompressor/transformers/finetune/data/base.py
@@ -106,8 +106,7 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
         logger.debug(f"Dataset after column renaming: {get_columns(dataset)}")
 
         model_input_names = getattr(self.processor, "model_input_names", ["input_ids"])
-        column_names = get_columns(dataset)
-        if not any(name in model_input_names for name in column_names):
+        if not any(col_name in model_input_names for col_name in get_columns(dataset)):
             # tokenize/ process
             dataset = self.filter_tokenizer_args(dataset)
             logger.debug(f"Tokenizer args after filtering: {get_columns(dataset)}")

From 9b34135e635048151eb995f082b692a00e1a293d Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 20:44:38 +0000
Subject: [PATCH 13/21] restore example

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/whisper_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index 2c18597a5..f3f83740b 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -28,7 +28,7 @@
 
 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 1  # 512
+NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048
 
 # Load dataset and preprocess.

From 2f3a416afcf08b82a0680370ea1926a814e485ca Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 20:46:26 +0000
Subject: [PATCH 14/21] support audio datasets

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 src/llmcompressor/modifiers/utils/pytorch_helpers.py | 3 ++-
 src/llmcompressor/transformers/finetune/data/base.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llmcompressor/modifiers/utils/pytorch_helpers.py b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
index c9869f267..50234461c 100644
--- a/src/llmcompressor/modifiers/utils/pytorch_helpers.py
+++ b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
@@ -41,7 +41,8 @@ def apply_pad_mask_to_batch(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.T
     :param batch: batch to apply padding to if it exists
     :return: batch with padding zeroed out in the input_ids
     """
-    batch["input_ids"] = batch["input_ids"] * batch["attention_mask"]
+    if "input_ids" in batch and "attention_mask" in batch:
+        batch["input_ids"] = batch["input_ids"] * batch["attention_mask"]
     return batch
 
 
diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
index 81a3fc95f..f4cd6120e 100644
--- a/src/llmcompressor/transformers/finetune/data/base.py
+++ b/src/llmcompressor/transformers/finetune/data/base.py
@@ -105,7 +105,8 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
         dataset = self.rename_columns(dataset)
         logger.debug(f"Dataset after column renaming: {get_columns(dataset)}")
 
-        if "input_ids" not in get_columns(dataset):
+        model_input_names = getattr(self.processor, "model_input_names", ["input_ids"])
+        if not any(col_name in model_input_names for col_name in get_columns(dataset)):
             # tokenize/ process
             dataset = self.filter_tokenizer_args(dataset)
             logger.debug(f"Tokenizer args after filtering: {get_columns(dataset)}")

From 74283e8ea4ac94373854fa8e7bb670970833627f Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 20:50:39 +0000
Subject: [PATCH 15/21] mask decoder_input_ids

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 src/llmcompressor/modifiers/utils/pytorch_helpers.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/llmcompressor/modifiers/utils/pytorch_helpers.py b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
index 50234461c..809507a4b 100644
--- a/src/llmcompressor/modifiers/utils/pytorch_helpers.py
+++ b/src/llmcompressor/modifiers/utils/pytorch_helpers.py
@@ -41,8 +41,14 @@ def apply_pad_mask_to_batch(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.T
     :param batch: batch to apply padding to if it exists
     :return: batch with padding zeroed out in the input_ids
     """
-    if "input_ids" in batch and "attention_mask" in batch:
-        batch["input_ids"] = batch["input_ids"] * batch["attention_mask"]
+    if "attention_mask" in batch:
+        if "input_ids" in batch:
+            batch["input_ids"] = batch["input_ids"] * batch["attention_mask"]
+        if "decoder_input_ids" in batch:
+            batch["decoder_input_ids"] = (
+                batch["decoder_input_ids"] * batch["attention_mask"]
+            )
+
     return batch
 
 

From 36ec9f05880a65b459bcbb187d5dc3628e0cb989 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 20 Jan 2025 21:43:35 +0000
Subject: [PATCH 16/21] update example sample

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/qwen2_audio_example.py | 4 +++-
 examples/multimodal_audio/whisper_example.py     | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index b27e79e39..930d57302 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -98,7 +98,9 @@ def tokenize(sample):
 output = model.generate(**sample_input)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
 print("==========================================\n\n")
-# that's where you have a lot of windows in the
+# that's where you have a lot of windows in the south no actually that's passive solar
+# and passive solar is something that was developed and designed in the 1960s and 70s
+# and it was a great thing for what it was at the time but it's not a passive house
 
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index f3f83740b..4b652b456 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -97,7 +97,9 @@ def process(sample):
 output = model.generate(**sample_input, language="en")
 print(processor.batch_decode(output, skip_special_tokens=True))
 print("==========================================\n\n")
-# The track appears on the compilation album "Kraftworks"
+# that's where you have a lot of windows in the south no actually that's passive solar
+# and passive solar is something that was developed and designed in the 1960s and 70s
+# and it was a great thing for what it was at the time but it's not a passive house
 
 # Save to disk compressed.
 SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"

From d11af96ff46873149f68ab98148c1b0d383ff8bb Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Tue, 21 Jan 2025 01:01:44 -0500
Subject: [PATCH 17/21] asdf

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 .../multimodal_audio/qwen2_audio_example.py   | 17 ++++++++++++-----
 examples/multimodal_audio/whisper_example.py  | 10 ++++++++--
 .../transformers/utils/data_collator.py       | 19 -------------------
 3 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index 930d57302..70db24523 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -1,3 +1,4 @@
+import torch
 from datasets import load_dataset
 from transformers import AutoProcessor
 
@@ -6,7 +7,6 @@
 from llmcompressor.transformers.tracing import (
     TraceableQwen2AudioForConditionalGeneration,
 )
-from llmcompressor.transformers.utils.data_collator import qwen2_audio_data_collator
 
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -67,14 +67,21 @@ def tokenize(sample):
 
 ds = ds.map(tokenize, remove_columns=ds.column_names)
 
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
 recipe = GPTQModifier(
     targets="Linear",
     scheme="W4A16",
     ignore=[
-        "re:audio_tower.*",
-        "re:multi_modal_projector.*",
+        # "re:audio_tower.*",
+        #"re:multi_modal_projector.*",
         "lm_head",
     ],  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
 )
@@ -86,14 +93,14 @@ def tokenize(sample):
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=qwen2_audio_data_collator,
+    data_collator=data_collator,
 )
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 breakpoint()
-sample_input = qwen2_audio_data_collator([next(iter(ds))])
+sample_input = data_collator([next(iter(ds))])
 sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
 output = model.generate(**sample_input)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index 4b652b456..85dce1fd1 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -5,7 +5,6 @@
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
-from llmcompressor.transformers.utils.data_collator import whisper_data_collator
 
 # Select model and load it.
 MODEL_ID = "openai/whisper-large-v2"
@@ -70,6 +69,13 @@ def process(sample):
 
 ds = ds.map(process, remove_columns=ds.column_names)
 
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
 recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
@@ -81,7 +87,7 @@ def process(sample):
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=whisper_data_collator,
+    data_collator=data_collator,
 )
 
 # Confirm generations of the quantized model look sane.
diff --git a/src/llmcompressor/transformers/utils/data_collator.py b/src/llmcompressor/transformers/utils/data_collator.py
index ed6e46803..4108c3e6c 100644
--- a/src/llmcompressor/transformers/utils/data_collator.py
+++ b/src/llmcompressor/transformers/utils/data_collator.py
@@ -57,22 +57,3 @@ def phi3_vision_data_collator(batch):
         "pixel_values": torch.tensor(batch[0]["pixel_values"]),
         "image_sizes": torch.tensor(batch[0]["image_sizes"]),
     }
-
-
-def whisper_data_collator(batch):
-    assert len(batch) == 1
-    return {
-        "input_features": torch.tensor(batch[0]["input_features"]),
-        "decoder_input_ids": torch.tensor(batch[0]["decoder_input_ids"]),
-        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
-    }
-
-
-def qwen2_audio_data_collator(batch):
-    assert len(batch) == 1
-    return {
-        "input_ids": torch.LongTensor(batch[0]["input_ids"]),
-        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
-        "input_features": torch.tensor(batch[0]["input_features"]),
-        "feature_attention_mask": torch.tensor(batch[0]["feature_attention_mask"]),
-    }

From f1bd1d21a10cf88547de5ed6d8a0163a43177157 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Sat, 25 Jan 2025 04:52:59 +0000
Subject: [PATCH 18/21] plug in readme

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index fd7f2f3e3..73790ec12 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ Applying quantization with `llmcompressor`:
 * [Activation quantization to `fp8`](examples/quantization_w8a8_fp8)
 * [Weight only quantization to `int4`](examples/quantization_w4a16)
 * [Quantizing MoE LLMs](examples/quantizing_moe)
+* [Quantizing Multimodal Audio LLMs](examples/multimodal_audio)
 
 ### User Guides
 Deep dives into advanced usage of `llmcompressor`:

From 0cbf97c1c7707902184fc19922b0cf2c52c1b1b0 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Mon, 27 Jan 2025 14:53:36 -0500
Subject: [PATCH 19/21] add readme

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/README.md | 63 ++++++++++++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md
index 30404ce4c..85a04ef85 100644
--- a/examples/multimodal_audio/README.md
+++ b/examples/multimodal_audio/README.md
@@ -1 +1,62 @@
-TODO
\ No newline at end of file
+# Quantizing Multimodal Audio Models #
+
+<audio controls>
+    <source src="https://datasets-server.huggingface.co/cached-assets/MLCommons/peoples_speech/--/f10597c5d3d3a63f8b6827701297c3afdf178272/--/clean/test/0/audio/audio.wav?Expires=1738010344&Signature=V6eMq7mQo1~wrkdswghsWaf9aklEQwoqw8FwJUiHAL75K7BcarTepBYcQkFIRi6usgU5J0TlX~wBwIlobAE7GzEXTUI7j5KA1MbFTiLo-nIYiq-WpA70EHW3mGy5HyCm01wKD49ngQDOgHX0-NrvTuXJCkTBhfYBwbQ5QsM8Wv3sbgEyadE~RMEGJLTfQL5fzQp3l1FWMdGuBJHDqSZa1SzTbOJYfmNQjGlfgWpm8Fhf5KWDl1NQSgWaiWRC0evbxt~C9Z8sEYwIEma7tTJafWqc2T9Awn8RdMqNKXnqSZ-mQBBxWVAV9cJbGKsj5JXJJwMPl23AUpzfSale71602g__&Key-Pair-Id=K3EI6M078Z3AC3">
+    Your browser does not support the audio element.
+</audio>
+<em>
+
+``` 
+<|startoftranscript|> <|en|>
+...
+
+<|transcribe|> <|notimestamps|>
+that's where you have a lot of windows in the south no actually that's passive solar
+and passive solar is something that was developed and designed in the 1960s and 70s
+and it was a great thing for what it was at the time but it's not a passive house
+```
+</em>
+
+This directory contains example scripts for quantizing a variety of audio language models using GPTQ quantization.
+
+## Compressing Your Own Model ##
+To use your own multimodal model, start with an existing example and change the `model_id` to match your own model stub.
+```python3
+model_id = "path/to/your/model"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype="auto",
+)
+```
+
+## Customizing GPTQModifier Parameters ##
+The GPTQModifier is the modifier responsible for performing quantization of the model weights. For more information on quantizing with different weight schemes, see the `quantization_` examples in the [examples folder](/examples/).
+
+```python3
+recipe = [
+    GPTQModifier(
+        targets="Linear",
+        scheme="W4A16",
+        sequential_targets=["WhisperEncoderLayer", "WhisperDecoderLayer"],
+        ignore=["lm_head"],
+    )
+]
+```
+
+### Sequential Targets ###
+Sequential targets are the modules which determine the granularity of error propagation and activation offloading when performing forward passes of the model. These are typically the "transformer blocks" of the model, also referred to as "layers" with llm-compressor.
+
+Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
+
+### Ignore ###
+If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoiding the untraceable operations.
+
+## Tracing Errors ##
+Because the architectures of multimodal audio models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
+
+## Adding Your Own SmoothQuant Mappings ##
+For a guide on adding SmoothQuant mappings for your model, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
+
+## Adding Your Own Data Collator ##
+Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
\ No newline at end of file

From 8c40a65e0053df41968c6dcff110e20992649f16 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Tue, 28 Jan 2025 15:57:15 -0500
Subject: [PATCH 20/21] gibberish is produced, even when the model is exactly
 copied

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 .../multimodal_audio/qwen2_audio_example.py   | 277 +++++++++++++-----
 examples/multimodal_vision/janus_example.py   |  76 +++++
 .../modifiers/quantization/gptq/base.py       |   2 +
 .../pipelines/sequential/helpers.py           |   4 +
 .../finetune/data/peoples_speech.py           |  31 ++
 .../transformers/tracing/qwen2_audio.py       |  56 ++--
 6 files changed, 358 insertions(+), 88 deletions(-)
 create mode 100644 examples/multimodal_vision/janus_example.py
 create mode 100644 src/llmcompressor/transformers/finetune/data/peoples_speech.py

diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index 70db24523..ff3c9be25 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -1,6 +1,9 @@
 import torch
 from datasets import load_dataset
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
+import soundfile as sf
+from io import BytesIO
+from urllib.request import urlopen
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
@@ -11,98 +14,238 @@
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 
-model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+#model = TraceableQwen2AudioForConditionalGeneration.from_pretrained(
+model = Qwen2AudioForConditionalGeneration.from_pretrained(
     MODEL_ID,
     device_map="auto",
     torch_dtype="auto",
 )
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-# Select calibration dataset.
-DATASET_ID = "MLCommons/peoples_speech"
-DATASET_SUBSET = "test"
-DATASET_SPLIT = "test"
-
-# Select number of samples. 512 samples is a good place to start.
-# Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
-
-# Load dataset and preprocess.
-ds = load_dataset(
-    DATASET_ID,
-    DATASET_SUBSET,
-    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
-    trust_remote_code=True,
-)
+# # Select calibration dataset.
+# DATASET_ID = "MLCommons/peoples_speech"
+# DATASET_SUBSET = "test"
+# DATASET_SPLIT = "test"
+
+# # Select number of samples. 512 samples is a good place to start.
+# # Increasing the number of samples can improve accuracy.
+# NUM_CALIBRATION_SAMPLES = 1 #512
+# MAX_SEQUENCE_LENGTH = 2048
+
+# # Load dataset and preprocess.
+# ds = load_dataset(
+#     DATASET_ID,
+#     DATASET_SUBSET,
+#     split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
+#     trust_remote_code=True,
+# )
+
+
+# def preprocess(example):
+#     messages = [
+#         # {"role": "system", "content": "You are a helpful assistant."},
+#         {"role": "user", "content": [{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
+#         # {"role": "user", "content": [{"type": "text", "text": "What does the person say?"}]},
+#     ]}
+#     ]
+
+#     audio_data = example["audio"]["array"]
+#     sample_rate = example["audio"]["sampling_rate"]
+
+#     # import librosa
+#     # new_sr = processor.feature_extractor.sampling_rate
+#     # audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=new_sr)
+#     # sample_rate = new_sr
+
+#     #processor.feature_extractor.sampling_rate
+
+#     # # Create an in-memory buffer
+#     # import io
+#     # buffer = io.BytesIO()
+
+#     # # Write the audio data to the in-memory buffer in WAV format
+#     # sf.write(buffer, audio_data, sample_rate, format='WAV')
+
+#     # import librosa
+#     # audio_data, sample_rate = librosa.load(buffer, sr=sample_rate)
+
+#     import librosa
+#     audio_data = librosa.load(
+#         BytesIO(urlopen("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav").read()), 
+#         sr=processor.feature_extractor.sampling_rate
+#     )[0]
+
+#     return {
+#         "text": processor.apply_chat_template(
+#             messages, add_generation_prompt=True, tokenize=False
+#         ),
+#         #"audios": [example["audio"]["array"]],
+#         "audios": [audio_data],
+#         #"array": example["audio"]["array"],
+#         #"sampling_rate": example["audio"]["sampling_rate"],
+#         "sampling_rate": sample_rate,
+#         #"sampling_rate": processor.feature_extractor.sampling_rate
+#     }
+
+
+# ds = ds.map(preprocess, remove_columns=ds.column_names)
+
+
+# # Tokenize inputs.
+# def tokenize(sample):
+#     return processor(**sample, return_tensors="pt")
+
+# # Process inputs.
+# def process(sample):
+
+#     messages = [
+#         {"role": "user", "content": [{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"}]}
+#     ]
+
+#     # import librosa
+#     # new_sr = processor.feature_extractor.sampling_rate
+#     # audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=new_sr)
+#     # sample_rate = new_sr
 
+#     #processor.feature_extractor.sampling_rate
 
-def preprocess(example):
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio", "audio": None},
-                {"type": "text", "text": "What does the person say?"},
-            ],
-        },
-    ]
+#     # # Create an in-memory buffer
+#     # import io
+#     # buffer = io.BytesIO()
 
-    return {
-        "text": processor.apply_chat_template(
-            messages, add_generation_prompt=True, tokenize=False
-        ),
-        "audios": [example["audio"]["array"]],
-        "sampling_rate": example["audio"]["sampling_rate"],
-    }
+#     # # Write the audio data to the in-memory buffer in WAV format
+#     # sf.write(buffer, audio_data, sample_rate, format='WAV')
 
+#     # import librosa
+#     # audio_data, sample_rate = librosa.load(buffer, sr=sample_rate)
 
-ds = ds.map(preprocess, remove_columns=ds.column_names)
+#     import librosa
+#     audio_data = librosa.load(
+#         BytesIO(urlopen("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav").read()), 
+#         sr=processor.feature_extractor.sampling_rate
+#     )[0]
 
+#     return processor(
+#         text=processor.apply_chat_template(
+#             messages, add_generation_prompt=True, tokenize=False
+#         ),
+#         #audio=sample["array"],
+#         audios=[audio_data],
+#         #sampling_rate=sample["sampling_rate"],
+#         #sampling_rate=sample["sampling_rate"],
+#         #add_special_tokens=True,
+#         return_tensors="pt",
+#         padding=True
+#     )
 
-# Tokenize inputs.
-def tokenize(sample):
-    return processor(**sample, return_tensors="pt")
 
 
-ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+#     audio_inputs = processor(
+#         text=sample["text"],
+#         #audio=sample["array"],
+#         audios=sample["audios"],
+#         #sampling_rate=sample["sampling_rate"],
+#         #sampling_rate=sample["sampling_rate"],
+#         #add_special_tokens=True,
+#         return_tensors="pt",
+#         padding=True
+#     )
+#     return audio_inputs
+
+#     text_inputs = processor(
+#         text=sample["text"], add_special_tokens=True, return_tensors="pt"
+#     )
+#     text_inputs["decoder_input_ids"] = text_inputs["input_ids"]
+#     del text_inputs["input_ids"]
+
+#     return dict(**audio_inputs, **text_inputs)
+
+
+# #ds = ds.map(tokenize, remove_columns=ds.column_names)
+# ds = ds.map(process, remove_columns=ds.column_names)
+
+messages = [
+    {"role": "user", "content": [{"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"}]}
+]
+
+# import librosa
+# new_sr = processor.feature_extractor.sampling_rate
+# audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=new_sr)
+# sample_rate = new_sr
+
+#processor.feature_extractor.sampling_rate
+
+# # Create an in-memory buffer
+# import io
+# buffer = io.BytesIO()
+
+# # Write the audio data to the in-memory buffer in WAV format
+# sf.write(buffer, audio_data, sample_rate, format='WAV')
+
+# import librosa
+# audio_data, sample_rate = librosa.load(buffer, sr=sample_rate)
+
+import librosa
+audio_data = librosa.load(
+    BytesIO(urlopen("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav").read()), 
+    sr=processor.feature_extractor.sampling_rate
+)[0]
+
+text = processor.apply_chat_template(
+    messages, add_generation_prompt=True, tokenize=False
+)
+
+breakpoint()
+sample_input = processor(
+    text=text,
+    #audio=sample["array"],
+    audios=[audio_data],
+    #sampling_rate=sample["sampling_rate"],
+    #sampling_rate=sample["sampling_rate"],
+    #add_special_tokens=True,
+    return_tensors="pt",
+    padding=True
+)
+breakpoint()
 
 
 # Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
+# def data_collator(batch):
+#     assert len(batch) == 1
+#     return {key: torch.tensor(value) for key, value in batch[0].items()}
 
 
 # Configure the quantization algorithm to run.
-#   * quantize the weights to 4 bit with GPTQ with a group size 128
-recipe = GPTQModifier(
-    targets="Linear",
-    scheme="W4A16",
-    ignore=[
-        # "re:audio_tower.*",
-        #"re:multi_modal_projector.*",
-        "lm_head",
-    ],  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
-)
+# #   * quantize the weights to 4 bit with GPTQ with a group size 128
+# recipe = GPTQModifier(
+#     targets="Linear",
+#     scheme="W4A16",
+#     ignore=[
+#         # "re:audio_tower.*",
+#         # "re:multi_modal_projector.*",
+#         "lm_head",
+#     ],  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
+# )
 
 # Apply algorithms.
-oneshot(
-    model=model,
-    dataset=ds,
-    recipe=recipe,
-    max_seq_length=MAX_SEQUENCE_LENGTH,
-    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=data_collator,
-)
+# oneshot(
+#     model=model,
+#     dataset=ds,
+#     recipe=recipe,
+#     max_seq_length=MAX_SEQUENCE_LENGTH,
+#     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+#     data_collator=data_collator,
+# )
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 breakpoint()
-sample_input = data_collator([next(iter(ds))])
+#sample_input = data_collator([next(iter(ds))])
+#sample_input = ds[0]
 sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
-output = model.generate(**sample_input)
+output = model.generate(**sample_input, max_new_tokens=256)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
 print("==========================================\n\n")
 # that's where you have a lot of windows in the south no actually that's passive solar
@@ -110,6 +253,6 @@ def data_collator(batch):
 # and it was a great thing for what it was at the time but it's not a passive house
 
 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
-model.save_pretrained(SAVE_DIR, save_compressed=True)
-processor.save_pretrained(SAVE_DIR)
+# SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+# model.save_pretrained(SAVE_DIR, save_compressed=True)
+# processor.save_pretrained(SAVE_DIR)
diff --git a/examples/multimodal_vision/janus_example.py b/examples/multimodal_vision/janus_example.py
new file mode 100644
index 000000000..59f800eb1
--- /dev/null
+++ b/examples/multimodal_vision/janus_example.py
@@ -0,0 +1,76 @@
+import requests
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration
+
+# Load model.
+model_id = "deepseek-ai/Janus-Pro-7B"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id, device_map="auto", torch_dtype="auto"
+)
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+# Oneshot arguments
+DATASET_ID = "flickr30k"
+DATASET_SPLIT = {"calibration": "test[:512]"}
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
+# Recipe
+recipe = [
+    GPTQModifier(
+        targets="Linear",
+        scheme="W4A16",
+        sequential_targets=["LlamaDecoderLayer"],
+        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+    ),
+]
+
+# Perform oneshot
+oneshot(
+    model=model,
+    tokenizer=model_id,
+    dataset=DATASET_ID,
+    splits=DATASET_SPLIT,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    trust_remote_code_model=True,
+    data_collator=data_collator,
+)
+
+# Confirm generations of the quantized model look sane.
+print("========== SAMPLE GENERATION ==============")
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Please describe the animal in this image\n"},
+            {"type": "image"},
+        ],
+    },
+]
+prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+image_url = "http://images.cocodataset.org/train2017/000000231895.jpg"
+raw_image = Image.open(requests.get(image_url, stream=True).raw)
+
+inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to("cuda")
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
+print("==========================================")
+
+# Save to disk compressed.
+SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 5e8a6b47e..42492e363 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -251,6 +251,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
                 )
             if isinstance(exception, unfixable_errors):
                 raise exception
+            
+            raise exception
 
             warnings.warn("Falling back to layer_sequential pipeline")
             try:
diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py
index 4945ba01e..743b2b138 100644
--- a/src/llmcompressor/pipelines/sequential/helpers.py
+++ b/src/llmcompressor/pipelines/sequential/helpers.py
@@ -70,11 +70,15 @@ def trace_subgraphs(
     tracer = get_tracer(model, sequential_targets, ignore)
     concrete_args = populate_concrete_args(model, sample_input)
 
+
     # trace
     with (
         calibration_forward_context(model),
         HooksMixin.disable_hooks(),
     ):
+        breakpoint()
+        model(**sample_input, **concrete_args)
+        exit(0)
         graph = GraphModule(
             model,
             tracer.trace(
diff --git a/src/llmcompressor/transformers/finetune/data/peoples_speech.py b/src/llmcompressor/transformers/finetune/data/peoples_speech.py
new file mode 100644
index 000000000..2e00b14a2
--- /dev/null
+++ b/src/llmcompressor/transformers/finetune/data/peoples_speech.py
@@ -0,0 +1,31 @@
+from copy import deepcopy
+from typing import TYPE_CHECKING
+
+from llmcompressor.transformers.finetune.data import TextGenerationDataset
+from llmcompressor.typing import Processor
+
+if TYPE_CHECKING:
+    from llmcompressor.transformers import DataTrainingArguments as DataArgs
+
+
+@TextGenerationDataset.register(name="peoples_speech")
+class PeoplesSpeech(TextGenerationDataset):
+    """
+    :param data_args: configuration settings for dataset loading
+    :param split: split from dataset to load, for instance `test` or `train[:5%]`
+    :param processor: processor or tokenizer to use on dataset
+    """
+
+    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
+        data_args = deepcopy(data_args)
+        data_args.dataset = "MLCommons/peoples_speech"
+        data_args.dataset_config_name = "test"
+
+        super().__init__(data_args=data_args, split=split, processor=processor)
+
+    def dataset_template(self, example):
+        return {
+            "audio": example["audio"]["array"],
+            "sampling_rate": example["audio"]["sampling_rate"],
+            "text": " " + example["text"].capitalize(),
+        }
diff --git a/src/llmcompressor/transformers/tracing/qwen2_audio.py b/src/llmcompressor/transformers/tracing/qwen2_audio.py
index 06dc1ac8e..ee89e44ef 100644
--- a/src/llmcompressor/transformers/tracing/qwen2_audio.py
+++ b/src/llmcompressor/transformers/tracing/qwen2_audio.py
@@ -1,4 +1,3 @@
-# flake8: noqa
 # coding=utf-8
 # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
 #
@@ -13,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# vllm-project: no copyright
 """PyTorch Qwen2Audio model."""
 
 import math
@@ -225,7 +223,6 @@ class Qwen2AudioFlashAttention2(Qwen2AudioAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -859,6 +856,9 @@ def __init__(self, config: Qwen2AudioConfig):
         self.multi_modal_projector = Qwen2AudioMultiModalProjector(config)
         self.vocab_size = config.text_config.vocab_size
         self.language_model = AutoModelForCausalLM.from_config(config.text_config)
+        if self.language_model._tied_weights_keys is not None:
+            self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
+
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         self._padding_side = "left"  # set it to left by default, user can use setter to change padding_sides
         self.post_init()
@@ -897,18 +897,6 @@ def set_decoder(self, decoder):
     def get_decoder(self):
         return self.language_model.get_decoder()
 
-    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights
-    def tie_weights(self):
-        return self.language_model.tie_weights()
-
-    # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings
-    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
-        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
-        # update vocab size
-        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        self.vocab_size = model_embeds.num_embeddings
-        return model_embeds
-
     def _merge_input_ids_with_audio_features(
         self, audio_features, num_audio_tokens, inputs_embeds, input_ids, attention_mask, labels
     ):
@@ -1092,9 +1080,7 @@ def _merge_input_ids_with_audio_features(
 
         audio_to_overwrite &= val
 
-        # TRACING
-        #if audio_to_overwrite.sum() != num_audio_tokens.sum():
-        if False:
+        if audio_to_overwrite.sum() != num_audio_tokens.sum():
             raise ValueError(
                 f"The input provided to the model are wrong. The number of audio tokens is {num_special_audio_tokens} while"
                 f" the number of audio given to the model is {num_audios}. This prevents correct indexing and breaks batch generation."
@@ -1202,9 +1188,34 @@ def forward(
                 selected_audio_feature = audio_outputs.last_hidden_state
                 audio_features = self.multi_modal_projector(selected_audio_feature)
 
-                inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features(
-                    audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels
-                )
+                # if we have consecutive audio tokens, then it means we expanded input_ids in processing
+                audio_tokens = input_ids == self.config.audio_token_index
+                legacy_processing = (audio_tokens[:, :-1] & audio_tokens[:, 1:]).sum() == 0
+
+                if legacy_processing:
+                    logger.warning_once(
+                        "Expanding inputs for audio tokens in Qwen2Audio should be done in processing."
+                    )
+                    inputs_embeds, attention_mask, labels, position_ids, _ = self._merge_input_ids_with_audio_features(
+                        audio_features, audio_output_lengths, inputs_embeds, input_ids, attention_mask, labels
+                    )
+                else:
+                    num_audios, max_audio_tokens, embed_dim = audio_features.shape
+                    audio_features_mask = torch.arange(max_audio_tokens, device=audio_output_lengths.device)[None, :]
+                    audio_features_mask = audio_features_mask < audio_output_lengths[:, None]
+                    audio_features = audio_features[audio_features_mask]
+
+                    n_audio_tokens = (input_ids == self.config.audio_token_index).sum().item()
+                    n_audio_features = audio_features.shape[0]
+
+                    if n_audio_tokens != n_audio_features:
+                        raise ValueError(
+                            f"Audio features and audio tokens do not match: tokens: {n_audio_tokens}, features {n_audio_features}"
+                        )
+                    special_audio_mask = (input_ids == self.config.audio_token_index).to(inputs_embeds.device)
+                    special_audio_mask = special_audio_mask.unsqueeze(-1).expand_as(inputs_embeds)
+                    audio_features = audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
+                    inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, audio_features)
 
         outputs = self.language_model(
             attention_mask=attention_mask,
@@ -1368,3 +1379,6 @@ def _update_model_kwargs_for_generation(
 
     def _reorder_cache(self, *args, **kwargs):
         return self.language_model._reorder_cache(*args, **kwargs)
+
+
+__all__ = ["Qwen2AudioForConditionalGeneration", "Qwen2AudioPreTrainedModel", "Qwen2AudioEncoder"]
\ No newline at end of file

From 6b775efb31fd4fecf34b24fc49ba581c5b238721 Mon Sep 17 00:00:00 2001
From: Kyle Sayers <kylesayrs@gmail.com>
Date: Tue, 28 Jan 2025 21:02:57 +0000
Subject: [PATCH 21/21] update readme

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
---
 examples/multimodal_audio/README.md | 40 ++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md
index 85a04ef85..507789490 100644
--- a/examples/multimodal_audio/README.md
+++ b/examples/multimodal_audio/README.md
@@ -1,10 +1,8 @@
 # Quantizing Multimodal Audio Models #
 
-<audio controls>
-    <source src="https://datasets-server.huggingface.co/cached-assets/MLCommons/peoples_speech/--/f10597c5d3d3a63f8b6827701297c3afdf178272/--/clean/test/0/audio/audio.wav?Expires=1738010344&Signature=V6eMq7mQo1~wrkdswghsWaf9aklEQwoqw8FwJUiHAL75K7BcarTepBYcQkFIRi6usgU5J0TlX~wBwIlobAE7GzEXTUI7j5KA1MbFTiLo-nIYiq-WpA70EHW3mGy5HyCm01wKD49ngQDOgHX0-NrvTuXJCkTBhfYBwbQ5QsM8Wv3sbgEyadE~RMEGJLTfQL5fzQp3l1FWMdGuBJHDqSZa1SzTbOJYfmNQjGlfgWpm8Fhf5KWDl1NQSgWaiWRC0evbxt~C9Z8sEYwIEma7tTJafWqc2T9Awn8RdMqNKXnqSZ-mQBBxWVAV9cJbGKsj5JXJJwMPl23AUpzfSale71602g__&Key-Pair-Id=K3EI6M078Z3AC3">
-    Your browser does not support the audio element.
-</audio>
-<em>
+https://github.com/user-attachments/assets/6732c60b-1ebe-4bed-b409-c16c4415dff5
+
+Audio provided by Daniel Galvez et al. under a Creative Commons Attribution license
 
 ``` 
 <|startoftranscript|> <|en|>
@@ -53,10 +51,38 @@ Choosing sequential targets with higher granularity (for example "Linear" instea
 If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoid the untraceable operations.
 
 ## Tracing Errors ##
-Because the architectures of vision-language models is often times more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
+Because the architectures of audio-language models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
 
 ## Adding Your Own Smoothquant Mappings ##
 For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
 
 ## Adding Your Own Data Collator ##
-Most examples utilize a generic `data_collator` which correctly correlates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
\ No newline at end of file
+Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
+
+## Sample Audio Provided Under a Creative Commons Attribution License ##
+https://creativecommons.org/licenses/by/4.0/legalcode
+```
+@article{DBLP:journals/corr/abs-2111-09344,
+  author    = {Daniel Galvez and
+               Greg Diamos and
+               Juan Ciro and
+               Juan Felipe Cer{\'{o}}n and
+               Keith Achorn and
+               Anjali Gopi and
+               David Kanter and
+               Maximilian Lam and
+               Mark Mazumder and
+               Vijay Janapa Reddi},
+  title     = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition
+               Dataset for Commercial Usage},
+  journal   = {CoRR},
+  volume    = {abs/2111.09344},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2111.09344},
+  eprinttype = {arXiv},
+  eprint    = {2111.09344},
+  timestamp = {Mon, 22 Nov 2021 16:44:07 +0100},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
\ No newline at end of file