From 8764291119709f7585028cf87f0500b6b8107d74 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 10:33:48 -0500 Subject: [PATCH 01/10] Remove log_model_load (#1016) ## Purpose ## * Remove unused code that is unlikely to be used in the future, since we're now using default transformers autoclasses to load quantized models ## Changes ## * Remove `log_model_load`, since we now prefer to load as run_compressed ## Testing ## `grep -r 'log_model_load' src/ tests/ examples/` Signed-off-by: Kyle Sayers --- .../pytorch/model_load/helpers.py | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py index a9ecb67a7..5ddc7ebd5 100644 --- a/src/llmcompressor/pytorch/model_load/helpers.py +++ b/src/llmcompressor/pytorch/model_load/helpers.py @@ -8,13 +8,11 @@ from torch.nn import Module from llmcompressor.core import active_session, create_session, pre_initialize_structure -from llmcompressor.pytorch.utils import ModuleSparsificationInfo from llmcompressor.typing import Processor COMPLETED_STAGES_FILENAME = "completed_stages.json" __all__ = [ - "log_model_load", "initialize_recipe", "save_model_and_recipe", "copy_python_files_from_model_cache", @@ -26,45 +24,6 @@ ] -def log_model_load( - model: Module, model_name_or_path: str, model_type: str, delayed_load: bool -): - """ - Log the state of a loaded model including sparsity and - prunable params information. - - :param model: the loaded model - :param model_name_or_path: the original name of or path to the model that loaded - :param model_type: specify the type of model loaded for logging; - ex one of [model, student, teacher] - :param delayed_load: True if this model load was delayed until after - recipe instantiation due to QAT or other architectural state changes - """ - if delayed_load: - logger.info( - f"Delayed load of model {model_name_or_path} detected. " - f"Will print out model information once LLMCompressor recipes have loaded" - ) - return - - sparsification_info = ModuleSparsificationInfo(model) - - logger.info( - f"Loaded {model_type} from {model_name_or_path} " - f"with {sparsification_info.params_total} total params. " - f"Of those there are {sparsification_info.params_prunable_total} prunable " - f"params which have {sparsification_info.params_prunable_sparse_percent} " - "avg sparsity." - ) - model_type = ( - "sparse" if sparsification_info.params_prunable_sparse_percent > 5 else "dense" - ) - logger.info( - f"{model_type} model detected, " - f"all sparsification info: {sparsification_info}" - ) - - def initialize_recipe(model: Module, recipe_path: str): """ Initializes a recipe that has been previously applied to the model From 8de50a21dc7044a5cb1e38a2029f6e8cb605b0d1 Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Wed, 29 Jan 2025 14:46:25 -0600 Subject: [PATCH 02/10] Return empty sparsity config if targets and ignores are empty (#1115) This PR fixes an issue where a sparsity configuration could end up being empty under certain conditions. Specifically, if the global sparsity is greater than 0.05, but no individual layer has a sparsity greater than 0.5, we end up with an empty sparsity config. 
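As a rough illustration of that condition (hypothetical numbers and simplified selection logic; the real threshold constants live in `SparsityConfigMetadata`):

```python3
# Hypothetical example of the problematic case: global sparsity clears the
# 0.05 threshold, so sparsity-config generation proceeds, but no individual
# layer clears the 0.5 per-layer threshold, so neither targets nor ignores
# end up being populated.
global_sparsity = 0.08  # > 0.05, config generation proceeds
layer_sparsities = {"model.layers.0.mlp": 0.10, "model.layers.1.mlp": 0.12}
targets = [name for name, s in layer_sparsities.items() if s > 0.5]  # -> []
ignores = []  # -> []
# both lists are empty, which previously still produced a (useless) config
```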
To address this, we now ensure that an empty sparsity config is not added in such cases --- - To see the specific tasks where the Asana app for GitHub is being used, see below: - https://app.asana.com/0/0/1209272443107638 Signed-off-by: Rahul Tuli Co-authored-by: Dipika Sikka --- .../transformers/compression/sparsity_config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llmcompressor/transformers/compression/sparsity_config.py b/src/llmcompressor/transformers/compression/sparsity_config.py index e048ac838..d35ddadd1 100644 --- a/src/llmcompressor/transformers/compression/sparsity_config.py +++ b/src/llmcompressor/transformers/compression/sparsity_config.py @@ -130,6 +130,11 @@ def from_pretrained( sparsity_threshold=SparsityConfigMetadata.SPARSITY_THRESHOLD, ) + if not (targets or ignores): + # no sparsity config + # needed if targets/ignores are empty + return None + return SparsityCompressionConfig.load_from_registry( format, global_sparsity=global_sparsity, From e32c8dc1c9ee20023b57eeb00a5aa5274160bdf4 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:08:41 -0500 Subject: [PATCH 03/10] Remove uses of get_observer (#939) ## Purpose ## * Be consistent about how the observer of the quantization arguments is referenced From compressed tensors: ```python3 def get_observer(self): return self.observer ``` ## Postrequisites ## * https://github.com/neuralmagic/compressed-tensors/pull/214 ## Changes ## * Remove all uses of `quantization_args.get_observer()` ## Testing ## ``` grep -r '\.get_observer()' src examples tests ``` Signed-off-by: Kyle Sayers --- src/llmcompressor/modifiers/quantization/cache.py | 4 ++-- src/llmcompressor/modifiers/quantization/calibration.py | 2 +- tests/llmcompressor/modifiers/calibration/test_cache.py | 2 +- tests/llmcompressor/observers/test_min_max.py | 8 ++++---- tests/llmcompressor/observers/test_mse.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py index 6277e9643..5b2be2c65 100644 --- a/src/llmcompressor/modifiers/quantization/cache.py +++ b/src/llmcompressor/modifiers/quantization/cache.py @@ -78,11 +78,11 @@ def update( """ if len(self.k_observers) <= layer_idx: - k_observer_name = self.quantization_args.get_observer() + k_observer_name = self.quantization_args.observer k_observer = Observer.load_from_registry( k_observer_name, quantization_args=self.quantization_args ) - v_observer_name = self.quantization_args.get_observer() + v_observer_name = self.quantization_args.observer v_observer = Observer.load_from_registry( v_observer_name, quantization_args=self.quantization_args ) diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index ee4ce171e..300507644 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -52,7 +52,7 @@ def initialize_observer( quantization_args = getattr(quantization_scheme, arg_name, None) # dont need observers for dynamic if quantization_args and not quantization_args.dynamic: - observer = quantization_args.get_observer() + observer = quantization_args.observer observer = Observer.load_from_registry( observer, quantization_args=quantization_args ) diff --git a/tests/llmcompressor/modifiers/calibration/test_cache.py b/tests/llmcompressor/modifiers/calibration/test_cache.py index 6ea024037..898c342f5 100644 --- 
a/tests/llmcompressor/modifiers/calibration/test_cache.py +++ b/tests/llmcompressor/modifiers/calibration/test_cache.py @@ -28,7 +28,7 @@ def test_is_quantized_cache_singleton(): args = QuantizationArgs() cache = QuantizedKVParameterCache(args) - observer = args.get_observer() + observer = args.observer observer = Observer.load_from_registry(observer, quantization_args=args) tensor = torch.tensor([1, 2, 3]) diff --git a/tests/llmcompressor/observers/test_min_max.py b/tests/llmcompressor/observers/test_min_max.py index f23a06dba..b592579f6 100644 --- a/tests/llmcompressor/observers/test_min_max.py +++ b/tests/llmcompressor/observers/test_min_max.py @@ -37,7 +37,7 @@ def test_min_max_observer(symmetric, expected_scale, expected_zero_point): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor) @@ -52,7 +52,7 @@ def test_min_max_observer_symmetric_scale_range(): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor) @@ -80,7 +80,7 @@ def test_min_max_observer_value_update(): tensor = inp num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) curr_max = 1 curr_min = 1 @@ -107,7 +107,7 @@ def test_g_idx(): weights = QuantizationArgs(num_bits=8, group_size=group_size) g_idx = make_dummy_g_idx(tensor.shape[1], group_size) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale_g_idx, zero_point_g_idx = observer(tensor, g_idx=g_idx) diff --git a/tests/llmcompressor/observers/test_mse.py b/tests/llmcompressor/observers/test_mse.py index ec2ecf1b5..4447813b3 100644 --- a/tests/llmcompressor/observers/test_mse.py +++ b/tests/llmcompressor/observers/test_mse.py @@ -32,7 +32,7 @@ def test_mse_observer(symmetric, expected_scale, expected_zero_point): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=symmetric, observer="mse") - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor) @@ -48,7 +48,7 @@ def test_mse_observer_symmetric_scale_range(): num_bits = 8 weights = QuantizationArgs(num_bits=num_bits, symmetric=True) - observer = weights.get_observer() + observer = weights.observer observer = Observer.load_from_registry(observer, quantization_args=weights) scale, zero_point = observer(tensor) From 75a3551e1b41734c59ea0e5bbcdffa664982c558 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:09:07 -0500 Subject: [PATCH 04/10] FSDP utils cleanup (#854) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose ## * Code cleanup * Fix bug where FSDP and accelerate imports are coupled, meaning that not having one will cause the other's utils to fail ## Changes ## * Decouple `accelerate` and `fsdp` imports * Use existing `FSDP_WRAPPED_MODULE` constant --------- Signed-off-by: Kyle Sayers Signed-off-by: andy-neuma Signed-off-by: Rahul Tuli Signed-off-by: Domenic 
Barbuzzi Signed-off-by: Dipika Co-authored-by: Dipika Sikka Co-authored-by: Kyle Sayers Co-authored-by: Kyle Sayers Co-authored-by: Jincheng Miao Co-authored-by: 黄石 Co-authored-by: dhuangnm <74931910+dhuangnm@users.noreply.github.com> Co-authored-by: dhuangnm Co-authored-by: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com> Co-authored-by: andy-neuma Co-authored-by: Rahul Tuli Co-authored-by: Domenic Barbuzzi Co-authored-by: Michael Goin Co-authored-by: George --- src/llmcompressor/utils/fsdp/context.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/llmcompressor/utils/fsdp/context.py b/src/llmcompressor/utils/fsdp/context.py index 177b2c02f..8cc062c19 100644 --- a/src/llmcompressor/utils/fsdp/context.py +++ b/src/llmcompressor/utils/fsdp/context.py @@ -1,10 +1,13 @@ try: from accelerate import Accelerator +except ImportError: + Accelerator = None + +try: from torch.distributed.fsdp import FullyShardedDataParallel - from torch.distributed.fsdp._common_utils import TrainingState + from torch.distributed.fsdp._common_utils import FSDP_WRAPPED_MODULE, TrainingState except ImportError: FullyShardedDataParallel = None - Accelerator = None from contextlib import nullcontext @@ -14,8 +17,6 @@ "fix_fsdp_module_name", ] -FSDP_WRAPPER_NAME = "_fsdp_wrapped_module" - def summon_full_params_context(model, offload_to_cpu: bool = False): if FullyShardedDataParallel is not None: @@ -46,12 +47,15 @@ def main_process_first_context(): def fix_fsdp_module_name(name: str) -> str: """ Remove FSDP wrapper prefixes from a module name. - Accounts for scenario where FSDP_WRAPPER_NAME is + Accounts for scenario where FSDP_WRAPPED_MODULE is at the end of the name, as well as in the middle. :param name: name to strip :return: stripped name """ - return name.replace(FSDP_WRAPPER_NAME + ".", "").replace( - "." + FSDP_WRAPPER_NAME, "" + if FullyShardedDataParallel is None: + return name + + return name.replace(FSDP_WRAPPED_MODULE + ".", "").replace( + "." + FSDP_WRAPPED_MODULE, "" ) From 768be88edfb9de9837efc97cf6bc43940e10a710 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:10:58 -0500 Subject: [PATCH 05/10] Update maintainers, add notice (#1091) ## Purpose ## * Cover legal bases w.r.t. HF code, in a similar way SparseML did ## Changes ## * Add a notice which indicates that some code is under HF copyright * Update maintainers list to reflect team changes --------- Signed-off-by: Kyle Sayers --- .MAINTAINERS | 8 +++--- NOTICE | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 NOTICE diff --git a/.MAINTAINERS b/.MAINTAINERS index 1a6355130..edc7f543c 100644 --- a/.MAINTAINERS +++ b/.MAINTAINERS @@ -2,11 +2,11 @@ # uncommented maintainers will be included in code review triage markurtz -bfineran -rahul-tuli -dbogunowicz dsikka -Satrat +rahul-tuli +horheynm +brian-dellabetta +kylesayrs # mgoin # anmarques diff --git a/NOTICE b/NOTICE new file mode 100644 index 000000000..f9c4e8178 --- /dev/null +++ b/NOTICE @@ -0,0 +1,72 @@ +LLM Compressor + +This product includes software developed in association with the vLLM Project (https://github.com/vllm-project). + +Source code in this repository is variously licensed under the Apache License +Version 2.0, an Apache-compatible license. + +* For a copy of the Apache License Version 2.0, please see [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). 
+ +* For a copy of all other Apache-compatible licenses and notices, + they will be listed below. + +======================================================================== +NOTICES +======================================================================== + +Package dependencies are defined in the Python setup.py file in this repository's top-level directory and have their own Apache-compatible licenses and terms. + +Hugging Face Transformers License https://github.com/huggingface/transformers/blob/master/LICENSE + +Some model implementations subclass and include code snippets from Hugging Face Transformers. +These snippets include and are subject to the Hugging Face Copyright and are +provided under the Apache License, Version 2.0 https://github.com/huggingface/transformers/blob/master/LICENSE + +PyTorch License https://github.com/pytorch/pytorch/blob/master/LICENSE + +Sample images are provided under a Creative Commons Attribution License +https://creativecommons.org/licenses/by/4.0/legalcode +``` +@article{cocodataset, + author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{'{a} }r and C. Lawrence Zitnick}, + title = {Microsoft {COCO:} Common Objects in Context}, + journal = {CoRR}, + volume = {abs/1405.0312}, + year = {2014}, + url = {http://arxiv.org/abs/1405.0312}, + archivePrefix = {arXiv}, + eprint = {1405.0312}, + timestamp = {Mon, 13 Aug 2018 16:48:13 +0200}, + biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + +Sample audio is provided under a Creative Commons Attribution License https://creativecommons.org/licenses/by/4.0/legalcode +``` +@article{DBLP:journals/corr/abs-2111-09344, + author = {Daniel Galvez and + Greg Diamos and + Juan Ciro and + Juan Felipe Cer{\'{o}}n and + Keith Achorn and + Anjali Gopi and + David Kanter and + Maximilian Lam and + Mark Mazumder and + Vijay Janapa Reddi}, + title = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition + Dataset for Commercial Usage}, + journal = {CoRR}, + volume = {abs/2111.09344}, + year = {2021}, + url = {https://arxiv.org/abs/2111.09344}, + eprinttype = {arXiv}, + eprint = {2111.09344}, + timestamp = {Mon, 22 Nov 2021 16:44:07 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + +Other external dependencies, if referenced in this repository's various subdirectories, are subject to their associated licenses and terms. \ No newline at end of file From a76563ab99c0e0ebf0347846fa580b27e55df74b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:12:29 -0500 Subject: [PATCH 06/10] Replace readme paths with urls (#1097) ## Purpose ## * Files with the `.md` extension are not listed in the [MANIFEST.in](https://github.com/vllm-project/llm-compressor/blob/main/MANIFEST.in), meaning that they will not be included in the LLM Compressor pypi package. This means that references to these files are left dangling for users who have installed from the pypi package. 
Rather than including `.md` in the package and having to also ship all the large images files associated with them, this PR moves the references to urls hosted by github * While the github url paths may change between versions, this solution works in lieu of a dedicated readthedoc build for each version * This solution also aligns with the practice of other libraries which point to hosted urls rather than file paths * Note that this does not apply to files which are themselves `.md` files, as these files will not be included in the pypi distribution * `src/llmcompressor/transformers/finetune/README.md` * `src/llmcompressor/pipelines/sequential/README.md` ## Changes ## * Replace readme file paths with urls * Small change to `DisableQuantization` to better catch cases where exceptions such as tracing exceptions are triggered ## Testing ## * N/A --------- Signed-off-by: Kyle Sayers --- src/llmcompressor/modifiers/quantization/gptq/base.py | 3 ++- src/llmcompressor/modifiers/smoothquant/utils.py | 6 ++++-- src/llmcompressor/utils/helpers.py | 8 +++++--- tests/llmcompressor/modifiers/smoothquant/test_utils.py | 5 ++++- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 5e8a6b47e..5e353d0cb 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -247,7 +247,8 @@ def on_initialize(self, state: State, **kwargs) -> bool: warnings.warn( f"Failed to trace {model_name} with inputs {input_names}. For more " "information on tracing with the sequential pipeline, see " - "`src/llmcompressor/transformers/tracing/GUIDE.md`" + "https://github.com/vllm-project/llm-compressor/blob/main/" + "src/llmcompressor/transformers/tracing/GUIDE.md" ) if isinstance(exception, unfixable_errors): raise exception diff --git a/src/llmcompressor/modifiers/smoothquant/utils.py b/src/llmcompressor/modifiers/smoothquant/utils.py index 83c82704e..a3af344e6 100644 --- a/src/llmcompressor/modifiers/smoothquant/utils.py +++ b/src/llmcompressor/modifiers/smoothquant/utils.py @@ -1,5 +1,4 @@ import functools -import pathlib from collections import namedtuple from typing import Dict, List, Tuple, Union @@ -94,7 +93,10 @@ def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except Exception as original_exception: - readme_location = pathlib.Path(__file__).parent / "README.md" + readme_location = ( + "https://github.com/vllm-project/llm-compressor/tree/main/" + "src/llmcompressor/modifiers/smoothquant" + ) raise RuntimeError( f"Error resolving mappings for given architecture." f"Please refer to the README at {readme_location} for more information." 
diff --git a/src/llmcompressor/utils/helpers.py b/src/llmcompressor/utils/helpers.py index e6bf7b319..ad4d884b2 100644 --- a/src/llmcompressor/utils/helpers.py +++ b/src/llmcompressor/utils/helpers.py @@ -1091,9 +1091,11 @@ def DisableQuantization(model: torch.nn.Module): """ Disable quantization from QuantizationModifier """ - model.apply(disable_quantization) - yield - model.apply(enable_quantization) + try: + model.apply(disable_quantization) + yield + finally: + model.apply(enable_quantization) @contextlib.contextmanager diff --git a/tests/llmcompressor/modifiers/smoothquant/test_utils.py b/tests/llmcompressor/modifiers/smoothquant/test_utils.py index 95be6bd30..457b64cdb 100644 --- a/tests/llmcompressor/modifiers/smoothquant/test_utils.py +++ b/tests/llmcompressor/modifiers/smoothquant/test_utils.py @@ -12,7 +12,10 @@ @pytest.mark.unit def test_handle_mapping_resolution_errors(): - README_LOCATION = "llmcompressor/modifiers/smoothquant/README.md" + README_LOCATION = ( + "https://github.com/vllm-project/llm-compressor/tree/main/" + "src/llmcompressor/modifiers/smoothquant" + ) @handle_mapping_resolution_errors def func_that_raises_exception(): From ba8563c58fda59168da02368505e8dafeff75b88 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:12:51 -0500 Subject: [PATCH 07/10] GPTQ add Arkiv link, move file location (#1100) ## Purpose ## * Better docstring for GPTQ * Reduce unnecessary file hierarchy Signed-off-by: Kyle Sayers --- src/llmcompressor/modifiers/quantization/gptq/base.py | 6 ++++-- .../quantization/gptq/{utils => }/gptq_quantize.py | 0 .../modifiers/quantization/gptq/utils/__init__.py | 3 --- 3 files changed, 4 insertions(+), 5 deletions(-) rename src/llmcompressor/modifiers/quantization/gptq/{utils => }/gptq_quantize.py (100%) delete mode 100644 src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py diff --git a/src/llmcompressor/modifiers/quantization/gptq/base.py b/src/llmcompressor/modifiers/quantization/gptq/base.py index 5e353d0cb..65e1c90e0 100644 --- a/src/llmcompressor/modifiers/quantization/gptq/base.py +++ b/src/llmcompressor/modifiers/quantization/gptq/base.py @@ -16,7 +16,7 @@ from llmcompressor.core import State from llmcompressor.modifiers import Modifier, ModifierFactory from llmcompressor.modifiers.quantization.calibration import freeze_module_quantization -from llmcompressor.modifiers.quantization.gptq.utils.gptq_quantize import ( +from llmcompressor.modifiers.quantization.gptq.gptq_quantize import ( accumulate_hessian, make_empty_hessian, quantize_weight, @@ -36,7 +36,9 @@ class GPTQModifier(Modifier, HooksMixin): """ - Modifier for applying the one-shot OBCQ algorithm to a model + Implements the GPTQ algorithm from https://arxiv.org/abs/2210.17323. This modifier + uses activations to calibrate a hessian matrix, which is then used to determine + optimal quantizion values and orderings for the model weights. 
| Sample yaml: | test_stage: diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/gptq_quantize.py b/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py similarity index 100% rename from src/llmcompressor/modifiers/quantization/gptq/utils/gptq_quantize.py rename to src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py diff --git a/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py b/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py deleted file mode 100644 index ec39da973..000000000 --- a/src/llmcompressor/modifiers/quantization/gptq/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# flake8: noqa - -from .gptq_quantize import * From 507b1a403a1de513f1529f8b7ed191dceaeede0b Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:15:29 -0500 Subject: [PATCH 08/10] Extend `remove_hooks` to remove subsets (#1021) ## Purpose ## * Allow subsets of hooks to be removed * Not strictly needed but helps promote code clarity in the case of wanda which adds and removes subsets of hooks at different times. ## Postrequisites ## * https://github.com/vllm-project/llm-compressor/pull/1023 * Layer compressor deprecation ## Changes ## * Change the datatype of `_hooks` from `List` to `Set` * Add `handles` argument to `HooksMixin.remove_hooks` ## Testing ## * Added `test_remove_hooks_parameterized` test --------- Signed-off-by: Kyle Sayers --- src/llmcompressor/modifiers/utils/hooks.py | 22 +++++++++++++------ .../modifiers/utils/test_hooks.py | 21 ++++++++++++++++++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/src/llmcompressor/modifiers/utils/hooks.py b/src/llmcompressor/modifiers/utils/hooks.py index bb1755519..386d58cac 100644 --- a/src/llmcompressor/modifiers/utils/hooks.py +++ b/src/llmcompressor/modifiers/utils/hooks.py @@ -1,6 +1,6 @@ import contextlib from functools import wraps -from typing import Any, Callable, ClassVar, List, Union +from typing import Any, Callable, ClassVar, Optional, Set, Union import torch from loguru import logger @@ -30,7 +30,7 @@ class HooksMixin(BaseModel): """ _HOOKS_DISABLED: ClassVar[bool] = False # attached to global HooksMixin - _hooks: List[RemovableHandle] = [] # attached to local subclasses + _hooks: Set[RemovableHandle] = set() # attached to local subclasses @classmethod @contextlib.contextmanager @@ -70,14 +70,22 @@ def wrapped_hook(*args, **kwargs): register_function = getattr(target, f"register_{hook_type}_hook") handle = register_function(wrapped_hook, **kwargs) - self._hooks.append(handle) + self._hooks.add(handle) logger.debug(f"{self} added {handle}") return handle - def remove_hooks(self): - """Remove all hooks belonging to a modifier""" - for hook in self._hooks: + def remove_hooks(self, handles: Optional[Set[RemovableHandle]] = None): + """ + Removes hooks registered by this modifier + + :param handles: optional list of handles to remove, defaults to all hooks + registerd by this modifier + """ + if handles is None: + handles = self._hooks + + for hook in handles: hook.remove() - self._hooks = [] + self._hooks -= handles diff --git a/tests/llmcompressor/modifiers/utils/test_hooks.py b/tests/llmcompressor/modifiers/utils/test_hooks.py index 5c4fc5891..df1eafedb 100644 --- a/tests/llmcompressor/modifiers/utils/test_hooks.py +++ b/tests/llmcompressor/modifiers/utils/test_hooks.py @@ -64,6 +64,27 @@ def test_remove_hooks(): assert mod_a.hook_called and not mod_b.hook_called +def test_remove_hooks_parameterized(): + model = DummyModel() + + mod_a = ModA() + mod_a_pre_hook = 
mod_a.register_hook(model.linear1, mod_a.hook, "forward_pre") + mod_a_post_hook = mod_a.register_hook(model.linear1, mod_a.hook, "forward") + + mod_b = ModB() + mod_b_pre_hook = mod_b.register_hook(model.linear2, mod_b.hook, "forward_pre") + mod_b_post_hook = mod_b.register_hook(model.linear2, mod_b.hook, "forward") + + mod_a.remove_hooks(set([mod_a_post_hook])) + mod_b.remove_hooks(set([mod_b_pre_hook])) + + assert len(mod_a._hooks) == 1 and next(iter(mod_a._hooks)) == mod_a_pre_hook + assert len(mod_b._hooks) == 1 and next(iter(mod_b._hooks)) == mod_b_post_hook + + model(model.dummy_inputs) + assert mod_a.hook_called and mod_b.hook_called + + def test_disable_hooks(): model = DummyModel() From 7fc4a6740dd7bc19285ba0aed79542305ceeb855 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:20:04 -0500 Subject: [PATCH 09/10] [Audio] Whisper Example and Readme (#1106) ## Purpose ## * Show example of quantizing whisper audio model ## Changes ## * Add whisper audio model example * Add traceable whisper definition (only need to comment out a value error check) * The embedded audio is achieved using [github attached files](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/attaching-files). While there's no official word on how long these files are maintained, if it is found that the file is deleted at some point, then we can replace it with a link to the file uploaded to the repo. ## Testing ## Successfully quantized whisper models and generated reasonable sample outputs * https://huggingface.co/nm-testing/whisper-tiny-W4A16-G128 * https://huggingface.co/nm-testing/whisper-large-v2-W4A16-G128 --------- Signed-off-by: Kyle Sayers --- README.md | 3 +- examples/multimodal_audio/README.md | 88 ++++++++++ examples/multimodal_audio/whisper_example.py | 116 +++++++++++++ examples/multimodal_vision/README.md | 20 ++- .../modifiers/smoothquant/utils.py | 11 ++ .../transformers/tracing/__init__.py | 6 +- .../transformers/tracing/whisper.py | 152 ++++++++++++++++++ 7 files changed, 393 insertions(+), 3 deletions(-) create mode 100644 examples/multimodal_audio/README.md create mode 100644 examples/multimodal_audio/whisper_example.py create mode 100644 src/llmcompressor/transformers/tracing/whisper.py diff --git a/README.md b/README.md index 9ba3caae3..9021c6193 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,8 @@ Applying quantization with `llmcompressor`: * [Activation quantization to `fp8`](examples/quantization_w8a8_fp8) * [Weight only quantization to `int4`](examples/quantization_w4a16) * [Quantizing MoE LLMs](examples/quantizing_moe) -* [Quantizing Multimodal VLMs](examples/multimodal_vision) +* [Quantizing Vision-Language Models](examples/multimodal_vision) +* [Quantizing Audio-Language Models](examples/multimodal_audio) ### User Guides Deep dives into advanced usage of `llmcompressor`: diff --git a/examples/multimodal_audio/README.md b/examples/multimodal_audio/README.md new file mode 100644 index 000000000..507789490 --- /dev/null +++ b/examples/multimodal_audio/README.md @@ -0,0 +1,88 @@ +# Quantizing Multimodal Audio Models # + +https://github.com/user-attachments/assets/6732c60b-1ebe-4bed-b409-c16c4415dff5 + +Audio provided by Daniel Galvez et al. under creative commons license + +``` +<|startoftranscript|> <|en|> +... 
+ +<|transcribe|> <|notimestamps|> +that's where you have a lot of windows in the south no actually that's passive solar +and passive solar is something that was developed and designed in the 1960s and 70s +and it was a great thing for what it was at the time but it's not a passive house +``` + + +This directory contains example scripts for quantizing a variety of audio language models using the GPTQ quantization. + +## Compressing Your Own Model ## +To use your own multimodal modal, start with an existing example change the `model_id` to match your own model stub. +```python3 +model_id = "path/to/your/model" +model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + torch_dtype="auto", +) +``` + +## Customizing GPTQModifier Parameters ## +The GPTQModifier is the modifier responsible for performing quantization of the model weights. For more information on quantizing with different weight schemes, see the `quantization_` examples in the [examples folder](/examples/). + +```python3 +recipe = [ + GPTQModifier( + targets="Linear", + scheme="W4A16", + sequential_targets=["WhisperEncoderLayer", "WhisperDecoderLayer"], + ignore=["lm_head"], + ) +] +``` + +### Sequential Targets ### +Sequential targets are the modules which determine the granularity of error propagation and activation offloading when performing forward passes of the model. These are typically the "transformer blocks" of the model, also referred to as "layers" with llm-compressor. + +Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations. + +### Ignore ### +If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoid the untraceable operations. + +## Tracing Errors ## +Because the architectures of audio-language models is often times more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md). + +## Adding Your Own Smoothquant Mappings ## +For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md). + +## Adding Your Own Data Collator ## +Most examples utilize a generic `data_collator` which correctly correlates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements. 
+ +## Sample Audio Provided Under a Creative Commons Attribution License ## +https://creativecommons.org/licenses/by/4.0/legalcode +``` +@article{DBLP:journals/corr/abs-2111-09344, + author = {Daniel Galvez and + Greg Diamos and + Juan Ciro and + Juan Felipe Cer{\'{o}}n and + Keith Achorn and + Anjali Gopi and + David Kanter and + Maximilian Lam and + Mark Mazumder and + Vijay Janapa Reddi}, + title = {The People's Speech: {A} Large-Scale Diverse English Speech Recognition + Dataset for Commercial Usage}, + journal = {CoRR}, + volume = {abs/2111.09344}, + year = {2021}, + url = {https://arxiv.org/abs/2111.09344}, + eprinttype = {arXiv}, + eprint = {2111.09344}, + timestamp = {Mon, 22 Nov 2021 16:44:07 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2111-09344.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` \ No newline at end of file diff --git a/examples/multimodal_audio/whisper_example.py b/examples/multimodal_audio/whisper_example.py new file mode 100644 index 000000000..303c9e935 --- /dev/null +++ b/examples/multimodal_audio/whisper_example.py @@ -0,0 +1,116 @@ +import torch +from datasets import load_dataset +from transformers import WhisperProcessor + +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier +from llmcompressor.transformers import oneshot +from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration + +# Select model and load it. +MODEL_ID = "openai/whisper-large-v2" + +model = TraceableWhisperForConditionalGeneration.from_pretrained( + MODEL_ID, + device_map="auto", + torch_dtype="auto", +) +model.config.forced_decoder_ids = None +processor = WhisperProcessor.from_pretrained(MODEL_ID) + +# Configure processor the dataset task. +processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") + +# Select calibration dataset. +DATASET_ID = "MLCommons/peoples_speech" +DATASET_SUBSET = "test" +DATASET_SPLIT = "test" + +# Select number of samples. 512 samples is a good place to start. +# Increasing the number of samples can improve accuracy. +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load dataset and preprocess. +ds = load_dataset( + DATASET_ID, + DATASET_SUBSET, + split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]", + trust_remote_code=True, +) + + +def preprocess(example): + return { + "array": example["audio"]["array"], + "sampling_rate": example["audio"]["sampling_rate"], + "text": " " + example["text"].capitalize(), + } + + +ds = ds.map(preprocess, remove_columns=ds.column_names) + + +# Process inputs. +def process(sample): + audio_inputs = processor( + audio=sample["array"], + sampling_rate=sample["sampling_rate"], + return_tensors="pt", + ) + + text_inputs = processor( + text=sample["text"], add_special_tokens=True, return_tensors="pt" + ) + text_inputs["decoder_input_ids"] = text_inputs["input_ids"] + del text_inputs["input_ids"] + + return dict(**audio_inputs, **text_inputs) + + +ds = ds.map(process, remove_columns=ds.column_names) + + +# Define a oneshot data collator for multimodal inputs. +def data_collator(batch): + assert len(batch) == 1 + return {key: torch.tensor(value) for key, value in batch[0].items()} + + +# Recipe +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]), +] + +# Apply algorithms. 
+oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, + data_collator=data_collator, +) + +# Confirm generations of the quantized model look sane. +print("\n\n") +print("========== SAMPLE GENERATION ==============") +sample_features = next(iter(ds))["input_features"] +sample_decoder_ids = [processor.tokenizer.prefix_tokens] +sample_input = { + "input_features": torch.tensor(sample_features).to(model.device), + "decoder_input_ids": torch.tensor(sample_decoder_ids).to(model.device), +} + +output = model.generate(**sample_input, language="en") +print(processor.batch_decode(output, skip_special_tokens=True)) +print("==========================================\n\n") +# that's where you have a lot of windows in the south no actually that's passive solar +# and passive solar is something that was developed and designed in the 1960s and 70s +# and it was a great thing for what it was at the time but it's not a passive house + +# Save to disk compressed. +SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) diff --git a/examples/multimodal_vision/README.md b/examples/multimodal_vision/README.md index 69f31ffb0..2f5ba83c1 100644 --- a/examples/multimodal_vision/README.md +++ b/examples/multimodal_vision/README.md @@ -61,4 +61,22 @@ Because the architectures of vision-language models is often times more complex For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md). ## Adding Your Own Data Collator ## -Most examples utilize a generic `data_collator` which correctly correlates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements. \ No newline at end of file +Most examples utilize a generic `data_collator` which correctly correlates data for most multimodal datasets. If you find that your model needs custom data collation (as is the case with [pixtral](/examples/multimodal_vision/pixtral_example.py)), you can modify this function to reflect these model-specific requirements. + +## Sample Image Provided Under a Creative Commons Attribution License ## +https://creativecommons.org/licenses/by/4.0/legalcode +``` +@article{cocodataset, + author = {Tsung{-}Yi Lin and Michael Maire and Serge J. Belongie and Lubomir D. Bourdev and Ross B. Girshick and James Hays and Pietro Perona and Deva Ramanan and Piotr Doll{'{a} }r and C. 
Lawrence Zitnick}, + title = {Microsoft {COCO:} Common Objects in Context}, + journal = {CoRR}, + volume = {abs/1405.0312}, + year = {2014}, + url = {http://arxiv.org/abs/1405.0312}, + archivePrefix = {arXiv}, + eprint = {1405.0312}, + timestamp = {Mon, 13 Aug 2018 16:48:13 +0200}, + biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` \ No newline at end of file diff --git a/src/llmcompressor/modifiers/smoothquant/utils.py b/src/llmcompressor/modifiers/smoothquant/utils.py index a3af344e6..a2a597215 100644 --- a/src/llmcompressor/modifiers/smoothquant/utils.py +++ b/src/llmcompressor/modifiers/smoothquant/utils.py @@ -52,6 +52,16 @@ smooth_layers="re:.*post_attention_layernorm", ), ] +WHISPER_V2_SMOOTHQUANT_MAPPINGS: List[LayerMap] = [ + LayerMap( + balance_layers=["re:.*k_proj", "re:.*v_proj", "re:.*q_proj"], + smooth_layers="re:.*self_attn_layer_norm", + ), + LayerMap( + balance_layers=["re:.*fc1"], + smooth_layers="re:.*final_layer_norm", + ), +] # Registry of layer mappings for different architectures @@ -64,6 +74,7 @@ "BloomForCausalLM": BLOOM_SMOOTHQUANT_MAPPINGS, "ChatGLMForConditionalGeneration": BLOOM_SMOOTHQUANT_MAPPINGS, "Phi3VForCausalLM": PHI3_VISION_SMOOTHQUANT_MAPPINGS, + "WhisperForConditionalGeneration": WHISPER_V2_SMOOTHQUANT_MAPPINGS, } diff --git a/src/llmcompressor/transformers/tracing/__init__.py b/src/llmcompressor/transformers/tracing/__init__.py index d8df42c93..39410a1ef 100644 --- a/src/llmcompressor/transformers/tracing/__init__.py +++ b/src/llmcompressor/transformers/tracing/__init__.py @@ -10,10 +10,14 @@ from .idefics3 import ( Idefics3ForConditionalGeneration as TraceableIdefics3ForConditionalGeneration ) +from .whisper import ( + WhisperForConditionalGeneration as TraceableWhisperForConditionalGeneration +) __all__ = [ "TraceableLlavaForConditionalGeneration", "TraceableMllamaForConditionalGeneration", "TraceableQwen2VLForConditionalGeneration", - "TraceableIdefics3ForConditionalGeneration" + "TraceableIdefics3ForConditionalGeneration", + "TraceableWhisperForConditionalGeneration", ] diff --git a/src/llmcompressor/transformers/tracing/whisper.py b/src/llmcompressor/transformers/tracing/whisper.py new file mode 100644 index 000000000..6e245760c --- /dev/null +++ b/src/llmcompressor/transformers/tracing/whisper.py @@ -0,0 +1,152 @@ +# flake8: noqa +# coding=utf-8 +# Copyright 2022 The OpenAI Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# vllm-project: no copyright +"""PyTorch Whisper model.""" + +import torch +from torch import nn + +from transformers import WhisperConfig +from transformers.models.whisper.modeling_whisper import ( + WhisperEncoder, + WhisperDecoder, + WhisperModel, + WhisperForConditionalGeneration, + WhisperForAudioClassification, +) +from transformers.modeling_outputs import BaseModelOutput + + +class WhisperEncoder(WhisperEncoder): + def forward( + self, + input_features, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0] + # TRACING: assume preprocessing is correct + # if input_features.shape[-1] != expected_seq_length: + if False: + raise ValueError( + f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}." + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + embed_pos = self.embed_positions.weight + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." 
+ + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + None, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + None, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class WhisperModel(WhisperModel): + def __init__(self, config: WhisperConfig): + super().__init__(config) + + self.encoder = WhisperEncoder(config) + self.decoder = WhisperDecoder(config) + # Initialize weights and apply final processing + self.post_init() + +class WhisperForConditionalGeneration(WhisperForConditionalGeneration): + def __init__(self, config: WhisperConfig): + super().__init__(config) + self.model = WhisperModel(config) + self.proj_out = nn.Linear(config.d_model, config.vocab_size, bias=False) + self.max_target_positions = config.max_target_positions + + # Initialize weights and apply final processing + self.post_init() + + +class WhisperForAudioClassification(WhisperForAudioClassification): + def __init__(self, config): + super().__init__(config) + + self.encoder = WhisperEncoder(config) + num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings + if config.use_weighted_layer_sum: + self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers) + self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size) + self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() \ No newline at end of file From 999d6600e3112c01f281085d5ebb7b934f1f4c0e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 29 Jan 2025 18:20:23 -0500 Subject: [PATCH 10/10] [Audio] Add whisper fp8 dynamic example (#1111) ## Purpose ## * Add example of quantizing multimodal model with FP8 dynamic ## Changes ## * Add whisper FP8 example, collaborated with @mgoin ## Testing ## * Ran example to mention Signed-off-by: Kyle Sayers --- .../quantization_w8a8_fp8/whisper_example.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 examples/quantization_w8a8_fp8/whisper_example.py diff --git a/examples/quantization_w8a8_fp8/whisper_example.py b/examples/quantization_w8a8_fp8/whisper_example.py new file mode 100644 index 000000000..df18b0d11 --- /dev/null +++ b/examples/quantization_w8a8_fp8/whisper_example.py @@ -0,0 +1,46 @@ +from datasets import load_dataset +from transformers import AutoProcessor, WhisperForConditionalGeneration + +from 
llmcompressor.modifiers.quantization import QuantizationModifier +from llmcompressor.transformers import oneshot + +MODEL_ID = "openai/whisper-large-v2" + +# Load model. +model = WhisperForConditionalGeneration.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +model.config.forced_decoder_ids = None +processor = AutoProcessor.from_pretrained(MODEL_ID) +processor.tokenizer.set_prefix_tokens(language="en", task="transcribe") + +# Configure the quantization algorithm and scheme. +# In this case, we: +# * quantize the weights to fp8 with per channel via ptq +# * quantize the activations to fp8 with dynamic per token +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] +) + +# Apply quantization. +oneshot(model=model, recipe=recipe) + +# Confirm generations of the quantized model look sane. +print("========== SAMPLE GENERATION ==============") +ds = load_dataset( + "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:1]" +) +sample = ds[0]["audio"] +input_features = processor( + sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" +).input_features +input_features = input_features.to(model.device) +predicted_ids = model.generate(input_features, language="en", forced_decoder_ids=None) +print(processor.batch_decode(predicted_ids, skip_special_tokens=False)[0]) +# Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel +print("==========================================") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR)