Bdellabe/awq modifier v3 #1177

Draft · wants to merge 15 commits into base: main

Changes from 5 commits
3 changes: 3 additions & 0 deletions src/llmcompressor/modifiers/awq/__init__.py
@@ -0,0 +1,3 @@
# flake8: noqa

from .base import *
712 changes: 712 additions & 0 deletions src/llmcompressor/modifiers/awq/base.py

Large diffs are not rendered by default.

48 changes: 20 additions & 28 deletions src/llmcompressor/modifiers/smoothquant/base.py
@@ -2,8 +2,9 @@
 from typing import Callable, Dict, List, Optional, Tuple, Union

 import torch
-from compressed_tensors.utils.offload import is_module_offloaded
+from accelerate.utils import align_module_device
 from loguru import logger
+from pydantic import ConfigDict
 from torch.nn import Module

 from llmcompressor.core import State
@@ -99,13 +100,16 @@ class SmoothQuantModifier(Modifier):
         to use the default tensor_module_forward
     """

+    # Allow arbitrary types because SmoothQuantMapping has fields of type torch.nn.Module
+    model_config: ConfigDict = ConfigDict(arbitrary_types_allowed=True)
+
     smoothing_strength: float = 0.5
     mappings: Optional[List[Union[Tuple, List]]] = None
     ignore: Optional[List[str]] = None
     num_calibration_steps: Optional[int] = None
     calibration_function: Optional[Callable] = None

-    resolved_mappings_: Optional[List] = None
+    resolved_mappings_: Optional[List[SmoothQuantMapping]] = None
     scales_: Optional[Dict] = None

     def on_initialize(self, state: State, **kwargs) -> bool:
@@ -166,7 +170,7 @@ def _infer_mappings_from_model(
         )

     @handle_mapping_resolution_errors
-    def _resolve_mappings(self, model: Module) -> List:
+    def _resolve_mappings(self, model: Module) -> List[SmoothQuantMapping]:
         """
         Transforms the list of activations to smooth and their corresponding weights
         into SmoothQuantMapping objects, resolving regular expressions.
@@ -289,22 +293,16 @@ def _apply_smoothing(self, model: Module):

         @torch.no_grad()
         def smooth(module):
-            offloaded = is_module_offloaded(module)
-            if offloaded:
-                module._hf_hook.pre_forward(module)
-
-            if module in balance_layers:
-                module.weight.mul_(scales.view(1, -1))
-            elif module == smooth_layer:
-                if module.weight.ndim == 1:
-                    module.weight.div_(scales)
-                else:
-                    module.weight.div_(scales.view(-1, 1))
-                if hasattr(module, "bias") and module.bias is not None:
-                    module.bias.div_(scales)
-
-            if offloaded:
-                module._hf_hook.post_forward(module, None)
+            with align_module_device(module):
+                if module in balance_layers:
+                    module.weight.mul_(scales.view(1, -1))
+                elif module == smooth_layer:
+                    if module.weight.ndim == 1:
+                        module.weight.div_(scales)
+                    else:
+                        module.weight.div_(scales.view(-1, 1))
+                    if hasattr(module, "bias") and module.bias is not None:
+                        module.bias.div_(scales)

             parent = get_fsdp_parent(mapping.smooth_name, model)
             if parent is not None:
@@ -329,15 +327,9 @@ def _calculate_smoothing_scales(
         # get the channel-wise dynamic range for each layer to be balanced
         weight_scales = []
         for layer in balance_layers:
-            offloaded = is_module_offloaded(layer)
-            if offloaded:
-                layer._hf_hook.pre_forward(layer)
-
-            scale = layer.weight.abs().max(dim=0, keepdim=True)[0]
-            weight_scales.append(scale)
-
-            if offloaded:
-                layer._hf_hook.post_forward(layer, None)
+            with align_module_device(layer):
+                scale = layer.weight.abs().max(dim=0, keepdim=True)[0]
+                weight_scales.append(scale)

         weight_scales = 2.0 * torch.cat(weight_scales, dim=0).max(dim=0)[0]
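For context on the change above: `align_module_device` from `accelerate` scopes the device placement that the removed `_hf_hook.pre_forward`/`post_forward` calls handled manually. A minimal sketch of the pattern on a toy layer (names and shapes are illustrative, not from this diff):

```python
import torch
from accelerate.utils import align_module_device

layer = torch.nn.Linear(4, 4)
scales = torch.full((4,), 2.0)

# Inside the context, parameters of a (possibly offloaded) module are
# materialized on the module's execution device, so in-place updates are safe
# without manually invoking the accelerate hook's pre/post-forward methods.
with torch.no_grad(), align_module_device(layer):
    layer.weight.div_(scales.view(-1, 1))
```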
102 changes: 102 additions & 0 deletions src/llmcompressor/pytorch/utils/helpers.py
@@ -2,6 +2,8 @@
 Utility / helper functions
 """

+import functools
+import inspect
 import os
 import random
 import re
@@ -85,6 +87,10 @@
     "detach",
     "adjust_quantization_for_onnx_export",
     "get_dependency_order",
+    "pseudo_quantize_tensor",
+    "pseudo_dequantize_linear",
+    "tensor_forward_with_input_args",
+    "sanitize_kwargs_for_module",
 ]


@@ -680,6 +686,43 @@ def mask_difference(old_mask: Tensor, new_mask: Tensor) -> Tensor:
    return -1.0 * newly_masked + newly_unmasked


def sanitize_kwargs_for_module(
    kwargs: Dict[str, Any], module: Module
) -> Dict[str, Any]:
    """
    Sanitize the kwargs for a Module by removing any keys that are not
    in the signature of the forward method.

    :param kwargs: the kwargs to sanitize
    :param module: the Module to sanitize the kwargs for
    :return: the sanitized kwargs for the callable object
    """
    if not isinstance(kwargs, dict):
        raise TypeError(f"Expected a dictionary as kwargs, but got {kwargs}")

    allowed_params = inspect.signature(module.forward).parameters
    return {key: value for key, value in kwargs.items() if key in allowed_params}


def tensor_forward_with_input_args(
    module: Module, inputs: Tensor, input_kwargs: Dict[str, Any]
) -> Tensor:
    """
    Forward the given inputs through the given module with the given input_kwargs.
    This function is a wrapper around tensors_module_forward that ensures that the
    input_kwargs are sanitized and passed to the module as keyword arguments during
    the forward pass.

    :param module: the module to forward the inputs through
    :param inputs: the inputs to forward through the module
    :param input_kwargs: the keyword arguments to pass to the
        module during the forward pass
    :return: the output of the module after forwarding the inputs through it
    """
    inputs = inputs.to(next(module.parameters()).device)
    input_kwargs = sanitize_kwargs_for_module(input_kwargs, module)

    return tensors_module_forward(inputs, functools.partial(module, **input_kwargs))


##############################
#
# pytorch module helper functions
@@ -1194,3 +1237,62 @@ def swap_modules(
    parent.__setattr__(sections[-1], submodule_to_replace)

    return cur


def pseudo_quantize_tensor(
    w: torch.Tensor, symmetric: bool = False, bit_width: int = 8, group_size: int = -1
):
    org_w_shape = w.shape
    if group_size > 0:
        assert org_w_shape[-1] % group_size == 0
        w = w.reshape(-1, group_size)
    assert w.dim() == 2
    assert torch.isnan(w).sum() == 0

    if not symmetric:
        # asymmetric: map each group's [min_val, max_val] onto the unsigned grid
        max_val = w.amax(dim=1, keepdim=True)
        min_val = w.amin(dim=1, keepdim=True)
        max_int = 2**bit_width - 1
        min_int = 0
        scales = (max_val - min_val).clamp(min=1e-5) / max_int
        zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
        w = (
            torch.clamp(torch.round(w / scales) + zeros, min_int, max_int) - zeros
        ) * scales
        zeros = zeros.view(org_w_shape[0], -1)
    else:
        # symmetric: map each group's [-max_val, max_val] onto the signed grid
        max_val = w.abs().amax(dim=1, keepdim=True)
        max_val = max_val.clamp(min=1e-5)
        max_int = 2 ** (bit_width - 1) - 1
        min_int = -(2 ** (bit_width - 1))
        scales = max_val / max_int
        zeros = None
        w = torch.clamp(torch.round(w / scales), min_int, max_int) * scales

    assert torch.isnan(scales).sum() == 0
    assert torch.isnan(w).sum() == 0

    scales = scales.view(org_w_shape[0], -1)
    w = w.reshape(org_w_shape)

    return w, scales, zeros
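For reference, a usage sketch of `pseudo_quantize_tensor`: 4-bit asymmetric fake quantization with group size 128. Shapes are illustrative, and the import path assumes the helper is re-exported from `llmcompressor.pytorch.utils` like its siblings:

```python
import torch

from llmcompressor.pytorch.utils import pseudo_quantize_tensor

w = torch.randn(256, 512)
w_q, scales, zeros = pseudo_quantize_tensor(
    w, symmetric=False, bit_width=4, group_size=128
)

assert w_q.shape == w.shape               # same shape, values snapped to the 4-bit grid
assert scales.shape == (256, 512 // 128)  # one scale per (row, group)
assert zeros.shape == scales.shape        # one zero point per (row, group)
print((w - w_q).abs().max())              # per-element error is at most half a step
```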


def pseudo_dequantize_linear(
    w: torch.nn.Linear,
    scales: torch.Tensor,
    zeros: Optional[torch.Tensor] = None,
    symmetric: bool = False,
):
    # get repeated count to tile per-group scales/zeros to the full weight shape
    repeat_count = w.weight.data.shape[-1] // scales.shape[-1]
    scales = scales.repeat(1, repeat_count).reshape(w.weight.data.shape)

    # dequantize
    if not symmetric:
        zeros = zeros.repeat(1, repeat_count).reshape(w.weight.data.shape)
        w = (w.weight.data - zeros) * scales
    else:
        w = w.weight.data * scales

    return w
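And a shape-focused sketch of `pseudo_dequantize_linear`, under the same import assumption. Note the first argument is the Linear module itself, whose weight is assumed to already hold integer codes (all values below are synthetic):

```python
import torch

from llmcompressor.pytorch.utils import pseudo_dequantize_linear

linear = torch.nn.Linear(512, 256, bias=False)
# Pretend the weight stores asymmetric 4-bit integer codes, with one
# (scale, zero-point) pair per group of 128 input columns.
linear.weight.data = torch.randint(0, 16, (256, 512)).float()
scales = torch.rand(256, 4) * 0.1
zeros = torch.randint(0, 16, (256, 4)).float()

# scales/zeros are tiled out to the full weight shape, then the affine
# dequantization (weight - zeros) * scales is applied.
w_dq = pseudo_dequantize_linear(linear, scales, zeros, symmetric=False)
assert w_dq.shape == (256, 512)
```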
1 change: 1 addition & 0 deletions src/llmcompressor/transformers/finetune/data/__init__.py
@@ -8,6 +8,7 @@
 from .flickr_30k import Flickr30K
 from .gsm8k import GSM8KDataset
 from .open_platypus import OpenPlatypusDataset
+from .pile import PileEvalDataset
 from .ptb import PtbDataset
 from .ultrachat_200k import UltraChatDataset
 from .wikitext import WikiTextDataset
31 changes: 31 additions & 0 deletions src/llmcompressor/transformers/finetune/data/pile.py
@@ -0,0 +1,31 @@
from copy import deepcopy
from typing import TYPE_CHECKING

from llmcompressor.transformers.finetune.data import TextGenerationDataset
from llmcompressor.typing import Processor

if TYPE_CHECKING:
    from llmcompressor.args import DatasetArguments


@TextGenerationDataset.register(name="pile_eval")
class PileEvalDataset(TextGenerationDataset):
    """
    Child text generation class for the PileEval dataset

    :param data_args: configuration settings for dataset loading
    :param split: split from dataset to load, for instance `test` or `train[:5%]`
    :param processor: processor or tokenizer to use on the dataset
    """

    def __init__(self, data_args: "DatasetArguments", split: str, processor: Processor):
        data_args = deepcopy(data_args)
        data_args.text_column = "text"
        data_args.dataset = "mit-han-lab/pile-val-backup"
        super().__init__(data_args=data_args, split=split, processor=processor)

    def dataset_template(self, sample):
        return {
            "text": self.processor.apply_chat_template(
                sample["text"].strip(),
            ),
        }
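With this registration, the dataset should be selectable by name during calibration. A hedged sketch, assuming the usual `oneshot` entrypoint and argument names, which are not part of this diff:

```python
from llmcompressor.transformers import oneshot

oneshot(
    model="facebook/opt-125m",      # illustrative model name
    dataset="pile_eval",            # resolves to PileEvalDataset via the registry
    recipe="recipe.yaml",           # hypothetical recipe path
    num_calibration_samples=128,
)
```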
20 changes: 20 additions & 0 deletions src/llmcompressor/utils/pytorch/module.py
@@ -60,6 +60,7 @@
     "get_layers_params",
     "get_matching_layer",
     "get_no_split_params",
+    "get_parent_by_name",
 ]


@@ -338,3 +339,22 @@ def get_no_split_params(module: Module) -> Union[str, List[str]]:
    if hasattr(model, "_no_split_modules"):
        return model._no_split_modules
    return ALL_TARGET


def get_parent_by_name(layer_name: str, model: Module) -> Tuple[str, Module]:
    """
    Get the parent layer of a layer by name.

    :param layer_name: Name of the layer to find the parent of.
    :param model: Model to search for the parent layer.
    :return: Tuple containing the name of the parent layer
        and the parent layer itself.
    """
    if not any(layer_name == name for name, _ in model.named_modules()):
        raise ValueError(f"Layer '{layer_name}' not found in model")

    parent_name_parts = layer_name.split(".")[:-1]
    if not parent_name_parts:
        return "", model

    parent_name = ".".join(parent_name_parts)
    return get_layer(parent_name, model)
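A sanity-check sketch for `get_parent_by_name` on a toy nested model, assuming the helper is re-exported from `llmcompressor.utils.pytorch` like its siblings (module names follow `nn.Sequential`'s integer naming):

```python
import torch

from llmcompressor.utils.pytorch import get_parent_by_name

model = torch.nn.Sequential(torch.nn.Sequential(torch.nn.Linear(4, 4)))

# Parent of the inner Linear ("0.0") is the inner Sequential, named "0".
name, parent = get_parent_by_name("0.0", model)
assert name == "0" and isinstance(parent, torch.nn.Sequential)

# A top-level module's parent is the model itself, returned with an empty name.
name, parent = get_parent_by_name("0", model)
assert name == "" and parent is model
```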
Empty file.
28 changes: 28 additions & 0 deletions tests/llmcompressor/modifiers/awq/test_base.py
@@ -0,0 +1,28 @@
import unittest

import pytest

from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.factory import ModifierFactory
from tests.llmcompressor.modifiers.conf import setup_modifier_factory


@pytest.mark.unit
class TestAWQIsRegistered(unittest.TestCase):
    def setUp(self):
        self.kwargs = {}
        setup_modifier_factory()

    def test_awq_is_registered(self):
        modifier = ModifierFactory.create(
            type_="AWQModifier",
            allow_experimental=False,
            allow_registered=True,
            **self.kwargs,
        )

        self.assertIsInstance(
            modifier,
            AWQModifier,
            "PyTorch AWQModifier not registered",
        )
42 changes: 42 additions & 0 deletions tests/llmcompressor/pytorch/utils/test_helpers.py
@@ -16,9 +16,11 @@
     get_optim_learning_rate,
     mask_difference,
     memory_aware_threshold,
+    sanitize_kwargs_for_module,
     set_optim_learning_rate,
     tensor_density,
     tensor_export,
+    tensor_forward_with_input_args,
     tensor_sample,
     tensor_sparsity,
     tensors_batch_size,
@@ -855,3 +857,43 @@ def test_memory_aware_threshold(tensor, idx):

    if prior_state is not None:
        os.environ[MEMORY_BOUNDED] = prior_state


class TestSanitizeKwargsForModule:
    @pytest.fixture
    def module(self):
        return Linear(10, 20)

    def test_sanitize_kwargs_for_module_not_dict(self, module):
        # Test with kwargs that are not a dictionary
        with pytest.raises(TypeError):
            sanitize_kwargs_for_module("not a dictionary", module)

    def test_sanitize_kwargs_for_module_not_in_signature(self, module):
        # Test with kwargs that are not in the signature of the forward method
        kwargs = {"not_in_signature": 123}
        sanitized_kwargs = sanitize_kwargs_for_module(kwargs, module)
        assert sanitized_kwargs == {}

    def test_sanitize_kwargs_for_module_in_signature(self, module):
        # Test with kwargs that are in the signature of the forward method
        kwargs = {"input": torch.randn(1, 10)}
        sanitized_kwargs = sanitize_kwargs_for_module(kwargs, module)
        assert sanitized_kwargs == kwargs


class TestTensorForwardWithInputArgs:
    @pytest.fixture
    def module(self):
        return Linear(10, 20)

    def test_tensor_forward_with_input_args(self, module):
        # Test with valid inputs and input_kwargs
        inputs = torch.randn(1, 10)
        input_kwargs = {}
        output = tensor_forward_with_input_args(module, inputs, input_kwargs)
        assert output.shape == (1, 20)

        # Test with input_kwargs that are not in the signature of the forward method
        input_kwargs = {"not_in_signature": 123}
        tensor_forward_with_input_args(module, inputs, input_kwargs)