diff --git a/examples/multimodal_audio/qwen2_audio_example.py b/examples/multimodal_audio/qwen2_audio_example.py
index 930d57302..70db24523 100644
--- a/examples/multimodal_audio/qwen2_audio_example.py
+++ b/examples/multimodal_audio/qwen2_audio_example.py
@@ -1,3 +1,4 @@
+import torch
 from datasets import load_dataset
 from transformers import AutoProcessor
 
@@ -6,7 +7,6 @@
 from llmcompressor.transformers.tracing import (
     TraceableQwen2AudioForConditionalGeneration,
 )
-from llmcompressor.transformers.utils.data_collator import qwen2_audio_data_collator
 
 # Select model and load it.
 MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
@@ -67,14 +67,21 @@ def tokenize(sample):
 
 ds = ds.map(tokenize, remove_columns=ds.column_names)
 
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
 recipe = GPTQModifier(
     targets="Linear",
     scheme="W4A16",
     ignore=[
-        "re:audio_tower.*",
-        "re:multi_modal_projector.*",
+        # "re:audio_tower.*",
+        # "re:multi_modal_projector.*",
         "lm_head",
     ],  # TODO: honestly, there's a decent number of parameters in the audio tower worth quantizing
 )
@@ -86,14 +93,14 @@ def tokenize(sample):
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=qwen2_audio_data_collator,
+    data_collator=data_collator,
 )
 
 # Confirm generations of the quantized model look sane.
 print("\n\n")
 print("========== SAMPLE GENERATION ==============")
 breakpoint()
-sample_input = qwen2_audio_data_collator([next(iter(ds))])
+sample_input = data_collator([next(iter(ds))])
 sample_input = {k: v.to(model.device) for k, v in sample_input.items()}
 output = model.generate(**sample_input)
 print(processor.batch_decode(output, skip_special_tokens=True)[0])
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
index 4b652b456..85dce1fd1 100644
--- a/examples/multimodal_audio/whisper_example.py
+++ b/examples/multimodal_audio/whisper_example.py
@@ -5,7 +5,6 @@
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
 from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
-from llmcompressor.transformers.utils.data_collator import whisper_data_collator
 
 # Select model and load it.
 MODEL_ID = "openai/whisper-large-v2"
@@ -70,6 +69,13 @@ def process(sample):
 
 ds = ds.map(process, remove_columns=ds.column_names)
 
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {key: torch.tensor(value) for key, value in batch[0].items()}
+
+
 # Configure the quantization algorithm to run.
 #   * quantize the weights to 4 bit with GPTQ with a group size 128
 recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
@@ -81,7 +87,7 @@ def process(sample):
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    data_collator=whisper_data_collator,
+    data_collator=data_collator,
 )
 
 # Confirm generations of the quantized model look sane.
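A note on the inline collator added to both examples: oneshot calibration feeds one sample at a time, and the mapped dataset stores each feature as a plain Python list, so the collator only needs to wrap every value in a tensor. The sketch below is illustrative only, using hypothetical sample values rather than real processor output, and shows the tensors that oneshot and model.generate receive; torch.tensor infers int64 for token ids and masks, and float32 for audio features.

# Illustrative sketch, not part of the patch: hypothetical sample values
# showing what the generic single-sample data_collator returns.
import torch


def data_collator(batch):
    assert len(batch) == 1  # oneshot calibration runs one sample at a time
    return {key: torch.tensor(value) for key, value in batch[0].items()}


sample = {
    "input_ids": [[151644, 8948, 198]],            # token ids -> int64
    "attention_mask": [[1, 1, 1]],                 # mask -> int64
    "input_features": [[[0.1, 0.2], [0.3, 0.4]]],  # audio features -> float32
}
out = data_collator([sample])
print({k: (v.dtype, tuple(v.shape)) for k, v in out.items()})
# {'input_ids': (torch.int64, (1, 3)),
#  'attention_mask': (torch.int64, (1, 3)),
#  'input_features': (torch.float32, (1, 2, 2))}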
diff --git a/src/llmcompressor/transformers/utils/data_collator.py b/src/llmcompressor/transformers/utils/data_collator.py
index ed6e46803..4108c3e6c 100644
--- a/src/llmcompressor/transformers/utils/data_collator.py
+++ b/src/llmcompressor/transformers/utils/data_collator.py
@@ -57,22 +57,3 @@ def phi3_vision_data_collator(batch):
         "pixel_values": torch.tensor(batch[0]["pixel_values"]),
         "image_sizes": torch.tensor(batch[0]["image_sizes"]),
     }
-
-
-def whisper_data_collator(batch):
-    assert len(batch) == 1
-    return {
-        "input_features": torch.tensor(batch[0]["input_features"]),
-        "decoder_input_ids": torch.tensor(batch[0]["decoder_input_ids"]),
-        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
-    }
-
-
-def qwen2_audio_data_collator(batch):
-    assert len(batch) == 1
-    return {
-        "input_ids": torch.LongTensor(batch[0]["input_ids"]),
-        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
-        "input_features": torch.tensor(batch[0]["input_features"]),
-        "feature_attention_mask": torch.tensor(batch[0]["feature_attention_mask"]),
-    }
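One behavioral detail behind deleting qwen2_audio_data_collator: it built input_ids with torch.LongTensor, while the inline replacement calls torch.tensor on every value. For Python lists of integers the two produce identical int64 tensors, so the example keeps its original dtypes. A minimal check with illustrative token ids (not real Qwen2-Audio output):

# Illustrative check, not part of the patch.
import torch

input_ids = [[151644, 8948, 198, 151645]]  # hypothetical token ids
a = torch.LongTensor(input_ids)  # dtype forced to int64, as in the removed collator
b = torch.tensor(input_ids)      # dtype inferred as int64, as in the new collator
assert a.dtype == b.dtype == torch.int64
assert torch.equal(a, b)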