diff --git a/.MAINTAINERS b/.MAINTAINERS index 1a6355130..edc7f543c 100644 --- a/.MAINTAINERS +++ b/.MAINTAINERS @@ -2,11 +2,11 @@ # uncommented maintainers will be included in code review triage markurtz -bfineran -rahul-tuli -dbogunowicz dsikka -Satrat +rahul-tuli +horheynm +brian-dellabetta +kylesayrs # mgoin # anmarques diff --git a/NOTICE b/NOTICE new file mode 100644 index 000000000..f9c4e8178 --- /dev/null +++ b/NOTICE @@ -0,0 +1,72 @@ +LLM Compressor + +This product includes software developed in association with the vLLM Project (https://github.com/vllm-project). + +Source code in this repository is variously licensed under the Apache License +Version 2.0, an Apache-compatible license. + +* For a copy of the Apache License Version 2.0, please see [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). + +* For a copy of all other Apache-compatible licenses and notices, + they will be listed below. + +======================================================================== +NOTICES +======================================================================== + +Package dependencies are defined in the Python setup.py file in this repository's top-level directory and have their own Apache-compatible licenses and terms. + +Hugging Face Transformers License https://github.com/huggingface/transformers/blob/master/LICENSE + +Some model implementations subclass and include code snippets from Hugging Face Transformers. +These snippets include and are subject to the Hugging Face Copyright and are +provided under the Apache License, Version 2.0 https://github.com/huggingface/transformers/blob/master/LICENSE + +PyTorch License https://github.com/pytorch/pytorch/blob/master/LICENSE + +Sample images are provided under a Creative Commons Attribution License +https://creativecommons.org/licenses/by/4.0/legalcode +``` +@article{cocodataset, + author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{'{a} }r and C. Lawrence Zitnick}, + title = {Microsoft {COCO:} Common Objects in Context}, + journal = {CoRR}, + volume = {abs/1405.0312}, + year = {2014}, + url = {http://arxiv.org/abs/1405.0312}, + archivePrefix = {arXiv}, + eprint = {1405.0312}, + timestamp = {Mon, 13 Aug 2018 16:48:13 +0200}, + biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + +Sample audio is provided under a Creative Commons Attribution License https://creativecommons.org/licenses/by/4.0/legalcode +``` +@article{DBLP:journals/corr/abs-2111-09344, + author = {Daniel Galvez and + Greg Diamos and + Juan Ciro and + Juan Felipe Cer{\'{o}}n and + Keith Achorn and + Anjali Gopi and + David Kanter and + Maximilian Lam and + Mark Mazumder and + Vijay Janapa Reddi}, + title = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition + Dataset for Commercial Usage}, + journal = {CoRR}, + volume = {abs/2111.09344}, + year = {2021}, + url = {https://arxiv.org/abs/2111.09344}, + eprinttype = {arXiv}, + eprint = {2111.09344}, + timestamp = {Mon, 22 Nov 2021 16:44:07 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + +Other external dependencies, if referenced in this repository's various subdirectories, are subject to their associated licenses and terms. 
\ No newline at end of file
diff --git a/README.md b/README.md
index 9ba3caae3..9021c6193 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,8 @@ Applying quantization with `llmcompressor`:
 * [Activation quantization to `fp8`](examples/quantization_w8a8_fp8)
 * [Weight only quantization to `int4`](examples/quantization_w4a16)
 * [Quantizing MoE LLMs](examples/quantizing_moe)
-* [Quantizing Multimodal VLMs](examples/multimodal_vision)
+* [Quantizing Vision-Language Models](examples/multimodal_vision)
+* [Quantizing Audio-Language Models](examples/multimodal_audio)
 
 ### User Guides
 Deep dives into advanced usage of `llmcompressor`:
diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md
new file mode 100644
index 000000000..507789490
--- /dev/null
+++ b/examples/multimodal_audio/README.md
@@ -0,0 +1,88 @@
+# Quantizing Multimodal Audio Models #
+
+https://github.com/user-attachments/assets/6732c60b-1ebe-4bed-b409-c16c4415dff5
+
+Audio provided by Daniel Galvez et al. under a Creative Commons Attribution license
+
+```
+<|startoftranscript|> <|en|>
+...
+
+<|transcribe|> <|notimestamps|>
+that's where you have a lot of windows in the south no actually that's passive solar
+and passive solar is something that was developed and designed in the 1960s and 70s
+and it was a great thing for what it was at the time but it's not a passive house
+```
+
+This directory contains example scripts for quantizing a variety of audio-language models using GPTQ quantization.
+
+## Compressing Your Own Model ##
+To use your own multimodal model, start with an existing example and change the `model_id` to match your own model stub.
+```python3
+model_id = "path/to/your/model"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype="auto",
+)
+```
+
+## Customizing GPTQModifier Parameters ##
+The GPTQModifier is the modifier responsible for performing quantization of the model weights. For more information on quantizing with different weight schemes, see the `quantization_` examples in the [examples folder](/examples/).
+
+```python3
+recipe = [
+    GPTQModifier(
+        targets="Linear",
+        scheme="W4A16",
+        sequential_targets=["WhisperEncoderLayer", "WhisperDecoderLayer"],
+        ignore=["lm_head"],
+    )
+]
+```
+
+### Sequential Targets ###
+Sequential targets are the modules which determine the granularity of error propagation and activation offloading when performing forward passes of the model. These are typically the "transformer blocks" of the model, also referred to as "layers" within llm-compressor.
+
+Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer Hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
+
+### Ignore ###
+If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoiding the untraceable operations.
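+
+For example, a recipe that keeps a problematic submodule out of both quantization and tracing might look like the following sketch. Here `"HypotheticalUntraceableModule"` is only a placeholder, not a real class; replace it with whichever module class (or regex pattern) fails to trace in your model.
+```python3
+from llmcompressor.modifiers.quantization import GPTQModifier
+
+recipe = [
+    GPTQModifier(
+        targets="Linear",
+        scheme="W4A16",
+        sequential_targets=["WhisperEncoderLayer", "WhisperDecoderLayer"],
+        # "HypotheticalUntraceableModule" is a placeholder; substitute the module
+        # class (or a regex such as "re:.*problematic_module") that raises the error
+        ignore=["lm_head", "HypotheticalUntraceableModule"],
+    )
+]
+```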
+
+## Tracing Errors ##
+Because the architectures of audio-language models are often more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
+
+## Adding Your Own Smoothquant Mappings ##
+For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
+
+## Adding Your Own Data Collator ##
+Most examples utilize a generic `data_collator` which correctly collates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements.
+
+## Sample Audio Provided Under a Creative Commons Attribution License ##
+https://creativecommons.org/licenses/by/4.0/legalcode
+```
+@article{DBLP:journals/corr/abs-2111-09344,
+  author    = {Daniel Galvez and
+               Greg Diamos and
+               Juan Ciro and
+               Juan Felipe Cer{\'{o}}n and
+               Keith Achorn and
+               Anjali Gopi and
+               David Kanter and
+               Maximilian Lam and
+               Mark Mazumder and
+               Vijay Janapa Reddi},
+  title     = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition
+               Dataset for Commercial Usage},
+  journal   = {CoRR},
+  volume    = {abs/2111.09344},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2111.09344},
+  eprinttype = {arXiv},
+  eprint    = {2111.09344},
+  timestamp = {Mon, 22 Nov 2021 16:44:07 +0100},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+```
\ No newline at end of file
diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py
new file mode 100644
index 000000000..303c9e935
--- /dev/null
+++ b/examples/multimodal_audio/whisper_example.py
@@ -0,0 +1,116 @@
+import torch
+from datasets import load_dataset
+from transformers import WhisperProcessor
+
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.transformers import oneshot
+from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration
+
+# Select model and load it.
+MODEL_ID = "openai/whisper-large-v2"
+
+model = TraceableWhisperForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+)
+model.config.forced_decoder_ids = None
+processor = WhisperProcessor.from_pretrained(MODEL_ID)
+
+# Configure the processor for the dataset task.
+processor.tokenizer.set_prefix_tokens(language="en", task="transcribe")
+
+# Select calibration dataset.
+DATASET_ID = "MLCommons/peoples_speech"
+DATASET_SUBSET = "test"
+DATASET_SPLIT = "test"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+# Load dataset and preprocess.
+ds = load_dataset( + DATASET_ID, + DATASET_SUBSET, + split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]", + trust_remote_code=True, +) + + +def preprocess(example): + return { + "array": example["audio"]["array"], + "sampling_rate": example["audio"]["sampling_rate"], + "text": " " + example["text"].capitalize(), + } + + +ds = ds.map(preprocess, remove_columns=ds.column_names) + + +# Process inputs. +def process(sample): + audio_inputs = processor( + audio=sample["array"], + sampling_rate=sample["sampling_rate"], + return_tensors="pt", + ) + + text_inputs = processor( + text=sample["text"], add_special_tokens=True, return_tensors="pt" + ) + text_inputs["decoder_input_ids"] = text_inputs["input_ids"] + del text_inputs["input_ids"] + + return dict(**audio_inputs, **text_inputs) + + +ds = ds.map(process, remove_columns=ds.column_names) + + +# Define a oneshot data collator for multimodal inputs. +def data_collator(batch): + assert len(batch) == 1 + return {key: torch.tensor(value) for key, value in batch[0].items()} + + +# Recipe +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]), +] + +# Apply algorithms. +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + data_collator=data_collator, +) + +# Confirm generations of the quantized model look sane. +print("\n\n") +print("========== SAMPLE GENERATION ==============") +sample_features = next(iter(ds))["input_features"] +sample_decoder_ids = [processor.tokenizer.prefix_tokens] +sample_input = { + "input_features": torch.tensor(sample_features).to(model.device), + "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device), +} + +output = model.generate(**sample_input, language="en") +print(processor.batch_decode(output, skip_special_tokens=True)) +print("==========================================\n\n") +# that's where you have a lot of windows in the south no actually that's passive solar +# and passive solar is something that was developed and designed in the 1960s and 70s +# and it was a great thing for what it was at the time but it's not a passive house + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/README.md b/examples/multimodal_vision/README.md index 69f31ffb0..2f5ba83c1 100644 --- a/examples/multimodal_vision/README.md +++ b/examples/multimodal_vision/README.md @@ -61,4 +61,22 @@ Because the architectures of vision-language models is often times more complex For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md). ## Adding Your Own Data Collator ## -Most examples utilize a generic `data_collator` which correctly correlates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements. \ No newline at end of file +Most examples utilize a generic `data_collator` which correctly correlates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements. 
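+
+As a rough sketch (assuming, as in the bundled examples, that calibration batches contain a single sample), a model-specific collator might look like the following; the field handling mirrors the collator used by the pixtral example and should be adjusted to whatever fields and shapes your model's processor emits.
+```python3
+import torch
+
+
+def data_collator(batch):
+    # the bundled examples calibrate with a single sample per batch
+    assert len(batch) == 1
+    return {
+        "input_ids": torch.LongTensor(batch[0]["input_ids"]),
+        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
+        "pixel_values": torch.tensor(batch[0]["pixel_values"]),
+    }
+```
+
+The collator is then passed to `oneshot(..., data_collator=data_collator)` alongside the model, dataset, and recipe.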
+ +## Sample Image Provided Under a Creative Commons Attribution License ## +https://creativecommons.org/licenses/by/4.0/legalcode +``` +@article{cocodataset, + author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{'{a} }r and C. Lawrence Zitnick}, + title = {Microsoft {COCO:} Common Objects in Context}, + journal = {CoRR}, + volume = {abs/1405.0312}, + year = {2014}, + url = {http://arxiv.org/abs/1405.0312}, + archivePrefix = {arXiv}, + eprint = {1405.0312}, + timestamp = {Mon, 13 Aug 2018 16:48:13 +0200}, + biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` \ No newline at end of file diff --git a/examples/multimodal_vision/pixtral_example.py b/examples/multimodal_vision/pixtral_example.py index ebb18df12..891819bc6 100644 --- a/examples/multimodal_vision/pixtral_example.py +++ b/examples/multimodal_vision/pixtral_example.py @@ -16,18 +16,20 @@ # Oneshot arguments DATASET_ID = "flickr30k" -DATASET_SPLIT = {"calibration": "test[:512]"} NUM_CALIBRATION_SAMPLES = 512 +DATASET_SPLIT = {"calibration": f"test[:{NUM_CALIBRATION_SAMPLES}]"} MAX_SEQUENCE_LENGTH = 2048 # Define a oneshot data collator for multimodal inputs. +# NOTE: for transformers<4.48.0, please squeeze the first dimension of `pixel_values` +# by appending `[0]` to the end of line 32 def data_collator(batch): assert len(batch) == 1 return { "input_ids": torch.LongTensor(batch[0]["input_ids"]), "attention_mask": torch.tensor(batch[0]["attention_mask"]), - "pixel_values": torch.tensor(batch[0]["pixel_values"])[0], + "pixel_values": torch.tensor(batch[0]["pixel_values"]), } diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py new file mode 100644 index 000000000..df18b0d11 --- /dev/null +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -0,0 +1,46 @@ +from datasets import load_dataset +from transformers import AutoProcessor, WhisperForConditionalGeneration + +from llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.transformers import oneshot + +MODEL_ID = "openai/whisper-large-v2" + +# Load model. +model = WhisperForConditionalGeneration.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +model.config.forced_decoder_ids = None +processor = AutoProcessor.from_pretrained(MODEL_ID) +processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") + +# Configure the quantization algorithm and scheme. +# In this case, we: +# * quantize the weights to fp8 with per channel via ptq +# * quantize the activations to fp8 with dynamic per token +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] +) + +# Apply quantization. +oneshot(model=model, recipe=recipe) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]" +) +sample = ds[0]["audio"] +input_features = processor( + sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" +).input_features +input_features = input_features.to(model.device) +predicted_ids = model.generate(input_features, language="en", forced_decoder_ids=None) +print(processor.batch_decode(predicted_ids, skip_special_tokens=False)[0]) +# Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel
+print("==========================================")
+
+# Save to disk in compressed-tensors format.
+SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+processor.save_pretrained(SAVE_DIR)
diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py
index 6277e9643..5b2be2c65 100644
--- a/src/llmcompressor/modifiers/quantization/cache.py
+++ b/src/llmcompressor/modifiers/quantization/cache.py
@@ -78,11 +78,11 @@ def update(
         """
 
         if len(self.k_observers) <= layer_idx:
-            k_observer_name = self.quantization_args.get_observer()
+            k_observer_name = self.quantization_args.observer
             k_observer = Observer.load_from_registry(
                 k_observer_name, quantization_args=self.quantization_args
             )
-            v_observer_name = self.quantization_args.get_observer()
+            v_observer_name = self.quantization_args.observer
             v_observer = Observer.load_from_registry(
                 v_observer_name, quantization_args=self.quantization_args
             )
diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py
index ee4ce171e..300507644 100644
--- a/src/llmcompressor/modifiers/quantization/calibration.py
+++ b/src/llmcompressor/modifiers/quantization/calibration.py
@@ -52,7 +52,7 @@ def initialize_observer(
     quantization_args = getattr(quantization_scheme, arg_name, None)
     # dont need observers for dynamic
     if quantization_args and not quantization_args.dynamic:
-        observer = quantization_args.get_observer()
+        observer = quantization_args.observer
         observer = Observer.load_from_registry(
             observer, quantization_args=quantization_args
         )
diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py
index 5e8a6b47e..65e1c90e0 100644
--- a/src/llmcompressor/modifiers/quantization/gptq/base.py
+++ b/src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -16,7 +16,7 @@
 from llmcompressor.core import State
 from llmcompressor.modifiers import Modifier, ModifierFactory
 from llmcompressor.modifiers.quantization.calibration import freeze_module_quantization
-from llmcompressor.modifiers.quantization.gptq.utils.gptq_quantize import (
+from llmcompressor.modifiers.quantization.gptq.gptq_quantize import (
     accumulate_hessian,
     make_empty_hessian,
     quantize_weight,
@@ -36,7 +36,9 @@ class GPTQModifier(Modifier, HooksMixin):
     """
-    Modifier for applying the one-shot OBCQ algorithm to a model
+    Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier
+    uses activations to calibrate a Hessian matrix, which is then used to determine
+    optimal quantization values and orderings for the model weights.
 
     | Sample yaml:
     |   test_stage:
@@ -247,7 +249,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
             warnings.warn(
                 f"Failed to trace {model_name} with inputs {input_names}.
For more " "information on tracing with the sequential pipeline, see " - "`src/llmcompressor/transformers/tracing/GUIDE.md`" + "https://github.com/vllm-project/llm-compressor/blob/main/" + "src/llmcompressor/transformers/tracing/GUIDE.md" ) if isinstance(exception, unfixable_errors): raise exception diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py similarity index 100% rename from src/llmcompressor/modifiers/quantization/gptq/utils/gptq_quantize.py rename to src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py b/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py deleted file mode 100644 index ec39da973..000000000 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa - -from .gptq_quantize import * diff --git a/src/llmcompressor/modifiers/smoothquant/utils.py b/src/llmcompressor/modifiers/smoothquant/utils.py index 83c82704e..a2a597215 100644 --- a/src/llmcompressor/modifiers/smoothquant/utils.py +++ b/src/llmcompressor/modifiers/smoothquant/utils.py @@ -1,5 +1,4 @@ import functools -import pathlib from collections import namedtuple from typing import Dict, List, Tuple, Union @@ -53,6 +52,16 @@ smooth_layers="re:.*post_attention_layernorm", ), ] +WHISPER_V2_SMOOTHQUANT_MAPPINGS: List[LayerMap] = [ + LayerMap( + balance_layers=["re:.*k_proj", "re:.*v_proj", "re:.*q_proj"], + smooth_layers="re:.*self_attn_layer_norm", + ), + LayerMap( + balance_layers=["re:.*fc1"], + smooth_layers="re:.*final_layer_norm", + ), +] # Registry of layer mappings for different architectures @@ -65,6 +74,7 @@ "BloomForCausalLM": BLOOM_SMOOTHQUANT_MAPPINGS, "ChatGLMForConditionalGeneration": BLOOM_SMOOTHQUANT_MAPPINGS, "Phi3VForCausalLM": PHI3_VISION_SMOOTHQUANT_MAPPINGS, + "WhisperForConditionalGeneration": WHISPER_V2_SMOOTHQUANT_MAPPINGS, } @@ -94,7 +104,10 @@ def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except Exception as original_exception: - readme_location = pathlib.Path(__file__).parent / "README.md" + readme_location = ( + "https://github.com/vllm-project/llm-compressor/tree/main/" + "src/llmcompressor/modifiers/smoothquant" + ) raise RuntimeError( f"Error resolving mappings for given architecture." f"Please refer to the README at {readme_location} for more information." 
diff --git a/src/llmcompressor/modifiers/utils/hooks.py b/src/llmcompressor/modifiers/utils/hooks.py index bb1755519..386d58cac 100644 --- a/src/llmcompressor/modifiers/utils/hooks.py +++ b/src/llmcompressor/modifiers/utils/hooks.py @@ -1,6 +1,6 @@ import contextlib from functools import wraps -from typing import Any, Callable, ClassVar, List, Union +from typing import Any, Callable, ClassVar, Optional, Set, Union import torch from loguru import logger @@ -30,7 +30,7 @@ class HooksMixin(BaseModel): """ _HOOKS_DISABLED: ClassVar[bool] = False # attached to global HooksMixin - _hooks: List[RemovableHandle] = [] # attached to local subclasses + _hooks: Set[RemovableHandle] = set() # attached to local subclasses @classmethod @contextlib.contextmanager @@ -70,14 +70,22 @@ def wrapped_hook(*args, **kwargs): register_function = getattr(target, f"register_{hook_type}_hook") handle = register_function(wrapped_hook, **kwargs) - self._hooks.append(handle) + self._hooks.add(handle) logger.debug(f"{self} added {handle}") return handle - def remove_hooks(self): - """Remove all hooks belonging to a modifier""" - for hook in self._hooks: + def remove_hooks(self, handles: Optional[Set[RemovableHandle]] = None): + """ + Removes hooks registered by this modifier + + :param handles: optional list of handles to remove, defaults to all hooks + registerd by this modifier + """ + if handles is None: + handles = self._hooks + + for hook in handles: hook.remove() - self._hooks = [] + self._hooks -= handles diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index a9ecb67a7..5ddc7ebd5 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -8,13 +8,11 @@ from torch.nn import Module from llmcompressor.core import active_session, create_session, pre_initialize_structure -from llmcompressor.pytorch.utils import ModuleSparsificationInfo from llmcompressor.typing import Processor COMPLETED_STAGES_FILENAME = "completed_stages.json" __all__ = [ - "log_model_load", "initialize_recipe", "save_model_and_recipe", "copy_python_files_from_model_cache", @@ -26,45 +24,6 @@ ] -def log_model_load( - model: Module, model_name_or_path: str, model_type: str, delayed_load: bool -): - """ - Log the state of a loaded model including sparsity and - prunable params information. - - :param model: the loaded model - :param model_name_or_path: the original name of or path to the model that loaded - :param model_type: specify the type of model loaded for logging; - ex one of [model, student, teacher] - :param delayed_load: True if this model load was delayed until after - recipe instantiation due to QAT or other architectural state changes - """ - if delayed_load: - logger.info( - f"Delayed load of model {model_name_or_path} detected. " - f"Will print out model information once LLMCompressor recipes have loaded" - ) - return - - sparsification_info = ModuleSparsificationInfo(model) - - logger.info( - f"Loaded {model_type} from {model_name_or_path} " - f"with {sparsification_info.params_total} total params. " - f"Of those there are {sparsification_info.params_prunable_total} prunable " - f"params which have {sparsification_info.params_prunable_sparse_percent} " - "avg sparsity." 
- ) - model_type = ( - "sparse" if sparsification_info.params_prunable_sparse_percent > 5 else "dense" - ) - logger.info( - f"{model_type} model detected, " - f"all sparsification info: {sparsification_info}" - ) - - def initialize_recipe(model: Module, recipe_path: str): """ Initializes a recipe that has been previously applied to the model diff --git a/src/llmcompressor/transformers/compression/sparsity_config.py b/src/llmcompressor/transformers/compression/sparsity_config.py index eb4b5f18c..d35ddadd1 100644 --- a/src/llmcompressor/transformers/compression/sparsity_config.py +++ b/src/llmcompressor/transformers/compression/sparsity_config.py @@ -130,6 +130,11 @@ def from_pretrained( sparsity_threshold=SparsityConfigMetadata.SPARSITY_THRESHOLD, ) + if not (targets or ignores): + # no sparsity config + # needed if targets/ignores are empty + return None + return SparsityCompressionConfig.load_from_registry( format, global_sparsity=global_sparsity, @@ -181,7 +186,11 @@ def is_sparse24_bitmask_supported( return False if not is_model_quantized(model): - # non-quantized 2:4 sparse models are supported + logger.warning( + "Compressed Sparse-only 2:4 models are not supported in vLLM<=0.7.0, " + "consider saving with `disable_sparse_compression` set, " + "`model.save_pretrained(..., disable_sparse_compression=True)`" + ) return True # when model is quantized, and has 2:4 sparsity diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py index d8df42c93..39410a1ef 100644 --- a/src/llmcompressor/transformers/tracing/__init__.py +++ b/src/llmcompressor/transformers/tracing/__init__.py @@ -10,10 +10,14 @@ from .idefics3 import ( Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration ) +from .whisper import ( + WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration +) __all__ = [ "TraceableLlavaForConditionalGeneration", "TraceableMllamaForConditionalGeneration", "TraceableQwen2VLForConditionalGeneration", - "TraceableIdefics3ForConditionalGeneration" + "TraceableIdefics3ForConditionalGeneration", + "TraceableWhisperForConditionalGeneration", ] diff --git a/src/llmcompressor/transformers/tracing/whisper.py b/src/llmcompressor/transformers/tracing/whisper.py new file mode 100644 index 000000000..6e245760c --- /dev/null +++ b/src/llmcompressor/transformers/tracing/whisper.py @@ -0,0 +1,152 @@ +# flake8: noqa +# coding=utf-8 +# Copyright 2022 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# vllm-project: no copyright +"""PyTorch Whisper model.""" + +import torch +from torch import nn + +from transformers import WhisperConfig +from transformers.models.whisper.modeling_whisper import ( + WhisperEncoder, + WhisperDecoder, + WhisperModel, + WhisperForConditionalGeneration, + WhisperForAudioClassification, +) +from transformers.modeling_outputs import BaseModelOutput + + +class WhisperEncoder(WhisperEncoder): + def forward( + self, + input_features, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0] + # TRACING: assume preprocessing is correct + # if input_features.shape[-1] != expected_seq_length: + if False: + raise ValueError( + f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}." + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." 
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + None, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + None, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class WhisperModel(WhisperModel): + def __init__(self, config: WhisperConfig): + super().__init__(config) + + self.encoder = WhisperEncoder(config) + self.decoder = WhisperDecoder(config) + # Initialize weights and apply final processing + self.post_init() + +class WhisperForConditionalGeneration(WhisperForConditionalGeneration): + def __init__(self, config: WhisperConfig): + super().__init__(config) + self.model = WhisperModel(config) + self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False) + self.max_target_positions = config.max_target_positions + + # Initialize weights and apply final processing + self.post_init() + + +class WhisperForAudioClassification(WhisperForAudioClassification): + def __init__(self, config): + super().__init__(config) + + self.encoder = WhisperEncoder(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() \ No newline at end of file diff --git a/src/llmcompressor/utils/fsdp/context.py b/src/llmcompressor/utils/fsdp/context.py index 177b2c02f..8cc062c19 100644 --- a/src/llmcompressor/utils/fsdp/context.py +++ b/src/llmcompressor/utils/fsdp/context.py @@ -1,10 +1,13 @@ try: from accelerate import Accelerator +except ImportError: + Accelerator = None + +try: from torch.distributed.fsdp import FullyShardedDataParallel - from torch.distributed.fsdp._common_utils import TrainingState + from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE, TrainingState except ImportError: FullyShardedDataParallel = None - Accelerator = None from contextlib import nullcontext @@ -14,8 +17,6 @@ "fix_fsdp_module_name", ] -FSDP_WRAPPER_NAME = "_fsdp_wrapped_module" - def summon_full_params_context(model, offload_to_cpu: bool = False): if FullyShardedDataParallel is not None: @@ -46,12 +47,15 @@ def main_process_first_context(): def fix_fsdp_module_name(name: str) -> str: """ Remove FSDP 
wrapper prefixes from a module name. - Accounts for scenario where FSDP_WRAPPER_NAME is + Accounts for scenario where FSDP_WRAPPED_MODULE is at the end of the name, as well as in the middle. :param name: name to strip :return: stripped name """ - return name.replace(FSDP_WRAPPER_NAME + ".", "").replace( - "." + FSDP_WRAPPER_NAME, "" + if FullyShardedDataParallel is None: + return name + + return name.replace(FSDP_WRAPPED_MODULE + ".", "").replace( + "." + FSDP_WRAPPED_MODULE, "" ) diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py index e6bf7b319..ad4d884b2 100644 --- a/src/llmcompressor/utils/helpers.py +++ b/src/llmcompressor/utils/helpers.py @@ -1091,9 +1091,11 @@ def DisableQuantization(model: torch.nn.Module): """ Disable quantization from QuantizationModifier """ - model.apply(disable_quantization) - yield - model.apply(enable_quantization) + try: + model.apply(disable_quantization) + yield + finally: + model.apply(enable_quantization) @contextlib.contextmanager diff --git a/tests/llmcompressor/modifiers/calibration/test_cache.py b/tests/llmcompressor/modifiers/calibration/test_cache.py index 6ea024037..898c342f5 100644 --- a/tests/llmcompressor/modifiers/calibration/test_cache.py +++ b/tests/llmcompressor/modifiers/calibration/test_cache.py @@ -28,7 +28,7 @@ def test_is_quantized_cache_singleton(): args = QuantizationArgs() cache = QuantizedKVParameterCache(args) - observer = args.get_observer() + observer = args.observer observer = Observer.load_from_registry(observer, quantization_args=args) tensor = torch.tensor([1, 2, 3]) diff --git a/tests/llmcompressor/modifiers/smoothquant/test_utils.py b/tests/llmcompressor/modifiers/smoothquant/test_utils.py index 95be6bd30..457b64cdb 100644 --- a/tests/llmcompressor/modifiers/smoothquant/test_utils.py +++ b/tests/llmcompressor/modifiers/smoothquant/test_utils.py @@ -12,7 +12,10 @@ @pytest.mark.unit def test_handle_mapping_resolution_errors(): - README_LOCATION = "llmcompressor/modifiers/smoothquant/README.md" + README_LOCATION = ( + "https://github.com/vllm-project/llm-compressor/tree/main/" + "src/llmcompressor/modifiers/smoothquant" + ) @handle_mapping_resolution_errors def func_that_raises_exception(): diff --git a/tests/llmcompressor/modifiers/utils/test_hooks.py b/tests/llmcompressor/modifiers/utils/test_hooks.py index 5c4fc5891..df1eafedb 100644 --- a/tests/llmcompressor/modifiers/utils/test_hooks.py +++ b/tests/llmcompressor/modifiers/utils/test_hooks.py @@ -64,6 +64,27 @@ def test_remove_hooks(): assert mod_a.hook_called and not mod_b.hook_called +def test_remove_hooks_parameterized(): + model = DummyModel() + + mod_a = ModA() + mod_a_pre_hook = mod_a.register_hook(model.linear1, mod_a.hook, "forward_pre") + mod_a_post_hook = mod_a.register_hook(model.linear1, mod_a.hook, "forward") + + mod_b = ModB() + mod_b_pre_hook = mod_b.register_hook(model.linear2, mod_b.hook, "forward_pre") + mod_b_post_hook = mod_b.register_hook(model.linear2, mod_b.hook, "forward") + + mod_a.remove_hooks(set([mod_a_post_hook])) + mod_b.remove_hooks(set([mod_b_pre_hook])) + + assert len(mod_a._hooks) == 1 and next(iter(mod_a._hooks)) == mod_a_pre_hook + assert len(mod_b._hooks) == 1 and next(iter(mod_b._hooks)) == mod_b_post_hook + + model(model.dummy_inputs) + assert mod_a.hook_called and mod_b.hook_called + + def test_disable_hooks(): model = DummyModel() diff --git a/tests/llmcompressor/observers/test_min_max.py b/tests/llmcompressor/observers/test_min_max.py index f23a06dba..b592579f6 100644 --- 
a/tests/llmcompressor/observers/test_min_max.py +++ b/tests/llmcompressor/observers/test_min_max.py @@ -37,7 +37,7 @@ def test_min_max_observer(symmetric, expected_scale, expected_zero_point): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor) @@ -52,7 +52,7 @@ def test_min_max_observer_symmetric_scale_range(): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor) @@ -80,7 +80,7 @@ def test_min_max_observer_value_update(): tensor = inp num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) curr_max = 1 curr_min = 1 @@ -107,7 +107,7 @@ def test_g_idx(): weights = QuantizationArgs(num_bits=8, group_size=group_size) g_idx = make_dummy_g_idx(tensor.shape[1], group_size) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale_g_idx, zero_point_g_idx = observer(tensor, g_idx=g_idx) diff --git a/tests/llmcompressor/observers/test_mse.py b/tests/llmcompressor/observers/test_mse.py index ec2ecf1b5..4447813b3 100644 --- a/tests/llmcompressor/observers/test_mse.py +++ b/tests/llmcompressor/observers/test_mse.py @@ -32,7 +32,7 @@ def test_mse_observer(symmetric, expected_scale, expected_zero_point): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric, observer="mse") - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor) @@ -48,7 +48,7 @@ def test_mse_observer_symmetric_scale_range(): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor)