Commit
Merge remote-tracking branch 'origin' into kylesayrs/hooks-mixin-keep
kylesayrs committed Jan 30, 2025
2 parents 5070615 + 999d660 commit 922ea62
Showing 23 changed files with 560 additions and 77 deletions.
8 changes: 4 additions & 4 deletions .MAINTAINERS
@@ -2,11 +2,11 @@
# uncommented maintainers will be included in code review triage

markurtz
bfineran
rahul-tuli
dbogunowicz
dsikka
Satrat
rahul-tuli
horheynm
brian-dellabetta
kylesayrs

# mgoin
# anmarques
72 changes: 72 additions & 0 deletions NOTICE
@@ -0,0 +1,72 @@
LLM Compressor

This product includes software developed in association with the vLLM Project (https://github.com/vllm-project).

Source code in this repository is variously licensed under the Apache License
Version 2.0, an Apache-compatible license.

* For a copy of the Apache License Version 2.0, please see [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).

* Copies of all other Apache-compatible licenses and notices are listed below.

========================================================================
NOTICES
========================================================================

Package dependencies are defined in the Python setup.py file in this repository's top-level directory and have their own Apache-compatible licenses and terms.

Hugging Face Transformers License https://github.com/huggingface/transformers/blob/master/LICENSE

Some model implementations subclass and include code snippets from Hugging Face Transformers.
These snippets include and are subject to the Hugging Face Copyright and are
provided under the Apache License, Version 2.0 https://github.com/huggingface/transformers/blob/master/LICENSE

PyTorch License https://github.com/pytorch/pytorch/blob/master/LICENSE

Sample images are provided under a Creative Commons Attribution License
https://creativecommons.org/licenses/by/4.0/legalcode
```
@article{cocodataset,
author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{\'{a}}r and C. Lawrence Zitnick},
title = {Microsoft {COCO:} Common Objects in Context},
journal = {CoRR},
volume = {abs/1405.0312},
year = {2014},
url = {http://arxiv.org/abs/1405.0312},
archivePrefix = {arXiv},
eprint = {1405.0312},
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

Sample audio is provided under a Creative Commons Attribution License https://creativecommons.org/licenses/by/4.0/legalcode
```
@article{DBLP:journals/corr/abs-2111-09344,
author = {Daniel Galvez and
Greg Diamos and
Juan Ciro and
Juan Felipe Cer{\'{o}}n and
Keith Achorn and
Anjali Gopi and
David Kanter and
Maximilian Lam and
Mark Mazumder and
Vijay Janapa Reddi},
title = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition
Dataset for Commercial Usage},
journal = {CoRR},
volume = {abs/2111.09344},
year = {2021},
url = {https://arxiv.org/abs/2111.09344},
eprinttype = {arXiv},
eprint = {2111.09344},
timestamp = {Mon, 22 Nov 2021 16:44:07 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```

Other external dependencies, if referenced in this repository's various subdirectories, are subject to their associated licenses and terms.
3 changes: 2 additions & 1 deletion README.md
@@ -39,7 +39,8 @@ Applying quantization with `llmcompressor`:
* [Activation quantization to `fp8`](examples/quantization_w8a8_fp8)
* [Weight only quantization to `int4`](examples/quantization_w4a16)
* [Quantizing MoE LLMs](examples/quantizing_moe)
* [Quantizing Multimodal VLMs](examples/multimodal_vision)
* [Quantizing Vision-Language Models](examples/multimodal_vision)
* [Quantizing Audio-Language Models](examples/multimodal_audio)

### User Guides
Deep dives into advanced usage of `llmcompressor`:
88 changes: 88 additions & 0 deletions examples/multimodal_audio/README.md
@@ -0,0 +1,88 @@
# Quantizing Multimodal Audio Models #

https://github.com/user-attachments/assets/6732c60b-1ebe-4bed-b409-c16c4415dff5

Audio provided by Daniel Galvez et al. under a Creative Commons Attribution license

```
<|startoftranscript|> <|en|>
...
<|transcribe|> <|notimestamps|>
that's where you have a lot of windows in the south no actually that's passive solar
and passive solar is something that was developed and designed in the 1960s and 70s
and it was a great thing for what it was at the time but it's not a passive house
```

This directory contains example scripts for quantizing a variety of audio-language models using GPTQ quantization.

## Compressing Your Own Model ##
To use your own multimodal model, start with an existing example and change the `model_id` to match your own model stub.
```python3
from transformers import AutoModelForCausalLM

model_id = "path/to/your/model"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
)
```

## Customizing GPTQModifier Parameters ##
The GPTQModifier is the modifier responsible for performing quantization of the model weights. For more information on quantizing with different weight schemes, see the `quantization_` examples in the [examples folder](/examples/).

```python3
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        sequential_targets=["WhisperEncoderLayer", "WhisperDecoderLayer"],
        ignore=["lm_head"],
    )
]
```
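
The recipe is then applied with `oneshot` along with the model, calibration dataset, and data collator. A minimal sketch, mirroring `whisper_example.py` in this directory and assuming `model`, `ds`, and `data_collator` are defined as in that script:

```python3
from llmcompressor.transformers import oneshot

oneshot(
    model=model,
    dataset=ds,  # processed calibration dataset, prepared as in whisper_example.py
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    data_collator=data_collator,
)
```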

### Sequential Targets ###
Sequential targets are the modules which determine the granularity of error propagation and activation offloading when performing forward passes of the model. These are typically the "transformer blocks" of the model, also referred to as "layers" within llm-compressor.

Choosing sequential targets with higher granularity (for example, "Linear" instead of "LlamaDecoderLayer") will result in fewer Hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher-granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
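
As a rough sketch of the trade-off (the Whisper module names below match the example above; swap in the transformer-block class names of your own model):

```python3
from llmcompressor.modifiers.quantization import GPTQModifier

# Block-level sequential targets: Hessians for all Linear layers in a block are
# held in memory at once; fewer offload/onload passes, higher peak memory.
blockwise = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    sequential_targets=["WhisperEncoderLayer", "WhisperDecoderLayer"],
    ignore=["lm_head"],
)

# Module-level sequential targets: fewer Hessians allocated at the same time and
# error propagated at finer granularity, at the cost of more offloading time.
modulewise = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    sequential_targets=["Linear"],
    ignore=["lm_head"],
)
```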

### Ignore ###
If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoiding the untraceable operations.
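
For example, a sketch of extending the ignore list (the second entry is a hypothetical module name; use the module reported in your own tracing error):

```python3
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = GPTQModifier(
    targets="Linear",
    scheme="W4A16",
    # "lm_head" is skipped during quantization; "model.audio_projector" is a
    # placeholder for whichever untraceable module your model reports.
    ignore=["lm_head", "model.audio_projector"],
)
```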

## Tracing Errors ##
Because the architectures of audio-language models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).

## Adding Your Own Smoothquant Mappings ##
For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).

## Adding Your Own Data Collator ##
Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
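
As a minimal sketch, the generic collator used by the examples in this directory assumes single-sample batches and simply converts each field to a tensor; extend it with any model-specific padding or stacking your processor requires:

```python3
import torch

def data_collator(batch):
    # The examples calibrate with batch size 1, so take the single sample and
    # convert each processor output field into a tensor.
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}
```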

## Sample Audio Provided Under a Creative Commons Attribution License ##
https://creativecommons.org/licenses/by/4.0/legalcode
```
@article{DBLP:journals/corr/abs-2111-09344,
author = {Daniel Galvez and
Greg Diamos and
Juan Ciro and
Juan Felipe Cer{\'{o}}n and
Keith Achorn and
Anjali Gopi and
David Kanter and
Maximilian Lam and
Mark Mazumder and
Vijay Janapa Reddi},
title = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition
Dataset for Commercial Usage},
journal = {CoRR},
volume = {abs/2111.09344},
year = {2021},
url = {https://arxiv.org/abs/2111.09344},
eprinttype = {arXiv},
eprint = {2111.09344},
timestamp = {Mon, 22 Nov 2021 16:44:07 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
116 changes: 116 additions & 0 deletions examples/multimodal_audio/whisper_example.py
@@ -0,0 +1,116 @@
import torch
from datasets import load_dataset
from transformers import WhisperProcessor

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration

# Select model and load it.
MODEL_ID = "openai/whisper-large-v2"

model = TraceableWhisperForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
model.config.forced_decoder_ids = None
processor = WhisperProcessor.from_pretrained(MODEL_ID)

# Configure the processor for the dataset task.
processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")

# Select calibration dataset.
DATASET_ID = "MLCommons/peoples_speech"
DATASET_SUBSET = "test"
DATASET_SPLIT = "test"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(
    DATASET_ID,
    DATASET_SUBSET,
    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
    trust_remote_code=True,
)


def preprocess(example):
    return {
        "array": example["audio"]["array"],
        "sampling_rate": example["audio"]["sampling_rate"],
        "text": " " + example["text"].capitalize(),
    }


ds = ds.map(preprocess, remove_columns=ds.column_names)


# Process inputs.
def process(sample):
    audio_inputs = processor(
        audio=sample["array"],
        sampling_rate=sample["sampling_rate"],
        return_tensors="pt",
    )

    text_inputs = processor(
        text=sample["text"], add_special_tokens=True, return_tensors="pt"
    )
    text_inputs["decoder_input_ids"] = text_inputs["input_ids"]
    del text_inputs["input_ids"]

    return dict(**audio_inputs, **text_inputs)


ds = ds.map(process, remove_columns=ds.column_names)


# Define a oneshot data collator for multimodal inputs.
def data_collator(batch):
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}


# Recipe
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
]

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
sample_features = next(iter(ds))["input_features"]
sample_decoder_ids = [processor.tokenizer.prefix_tokens]
sample_input = {
    "input_features": torch.tensor(sample_features).to(model.device),
    "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device),
}

output = model.generate(**sample_input, language="en")
print(processor.batch_decode(output, skip_special_tokens=True))
print("==========================================\n\n")
# that's where you have a lot of windows in the south no actually that's passive solar
# and passive solar is something that was developed and designed in the 1960s and 70s
# and it was a great thing for what it was at the time but it's not a passive house

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
20 changes: 19 additions & 1 deletion examples/multimodal_vision/README.md
@@ -61,4 +61,22 @@ Because the architectures of vision-language models is often times more complex
For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).

## Adding Your Own Data Collator ##
Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.

## Sample Image Provided Under a Creative Commons Attribution License ##
https://creativecommons.org/licenses/by/4.0/legalcode
```
@article{cocodataset,
author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{\'{a}}r and C. Lawrence Zitnick},
title = {Microsoft {COCO:} Common Objects in Context},
journal = {CoRR},
volume = {abs/1405.0312},
year = {2014},
url = {http://arxiv.org/abs/1405.0312},
archivePrefix = {arXiv},
eprint = {1405.0312},
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
46 changes: 46 additions & 0 deletions examples/quantization_w8a8_fp8/whisper_example.py
@@ -0,0 +1,46 @@
from datasets import load_dataset
from transformers import AutoProcessor, WhisperForConditionalGeneration

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "openai/whisper-large-v2"

# Load model.
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
model.config.forced_decoder_ids = None
processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per-channel scales via PTQ
# * quantize the activations to fp8 with dynamic per-token scales
recipe = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
)

# Apply quantization.
oneshot(model=model, recipe=recipe)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]"
)
sample = ds[0]["audio"]
input_features = processor(
    sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
).input_features
input_features = input_features.to(model.device)
predicted_ids = model.generate(input_features, language="en", forced_decoder_ids=None)
print(processor.batch_decode(predicted_ids, skip_special_tokens=False)[0])
# Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
4 changes: 2 additions & 2 deletions src/llmcompressor/modifiers/quantization/cache.py
@@ -78,11 +78,11 @@ def update(
"""

if len(self.k_observers) <= layer_idx:
k_observer_name = self.quantization_args.get_observer()
k_observer_name = self.quantization_args.observer
k_observer = Observer.load_from_registry(
k_observer_name, quantization_args=self.quantization_args
)
v_observer_name = self.quantization_args.get_observer()
v_observer_name = self.quantization_args.observer
v_observer = Observer.load_from_registry(
v_observer_name, quantization_args=self.quantization_args
)