merge with main

Signed-off-by: Kyle Sayers <[email protected]>
kylesayrs committed Feb 13, 2025
1 parent 4093df7 commit 5b384bb

Showing 58 changed files with 986 additions and 340 deletions.
13 changes: 0 additions & 13 deletions examples/automodelforcausallm/README.md

This file was deleted.

11 changes: 0 additions & 11 deletions examples/automodelforcausallm/run_automodelforcausallm.py

This file was deleted.

12 changes: 4 additions & 8 deletions examples/finetuning/example_alternating_recipe.yaml
@@ -4,12 +4,10 @@ initial_sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
block_size: 128
sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
targets: [
"re:model.layers.\\d+$"
]
targets: ["Linear"]
ignore: ["re:.*lm_head"]
initial_training_stage:
run_type: train
pruning_modifiers:
@@ -22,12 +20,10 @@ next_sparsity_stage:
SparseGPTModifier:
sparsity: 0.7
block_size: 128
sequential_update: False
percdamp: 0.01
mask_structure: "0:0"
targets: [
"re:model.layers.\\d+$"
]
targets: ["Linear"]
ignore: ["re:.*lm_head"]
next_training_stage:
run_type: train
pruning_modifiers:
@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
sequential_update: false
targets: ["Linear"]
ignore: ["re:.*lm_head"]
finetuning_stage:
run_type: train
finetuning_modifiers:
@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
sequential_update: false
targets: ["Linear"]
ignore: ["re:.*lm_head"]
finetuning_stage:
run_type: train
finetuning_modifiers:
4 changes: 4 additions & 0 deletions examples/quantizing_moe/deepseek_moe_w4a16.py
@@ -5,6 +5,10 @@
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
# Please consider downgrading to an earlier transformers release or
# upgrading to a version in which this bug is fixed.

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-V2.5"

4 changes: 4 additions & 0 deletions examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
@@ -4,6 +4,10 @@
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
# Please consider downgrading to an earlier transformers release or
# upgrading to a version in which this bug is fixed.

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

4 changes: 4 additions & 0 deletions examples/quantizing_moe/deepseek_moe_w8a8_int8.py
@@ -6,6 +6,10 @@
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
# Please consider downgrading to an earlier transformers release or
# upgrading to a version in which this bug is fixed.

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"

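The note repeated in the three DeepSeek examples above recommends avoiding transformers 4.48.0. As an illustration only (not part of this commit), a small guard along these lines could fail fast with a clearer message; the exact check and wording are assumptions:

```python
import transformers
from packaging import version

# transformers 4.48.0 is reported to break DeepSeek imports; raise a clear error early.
if version.parse(transformers.__version__) == version.parse("4.48.0"):
    raise RuntimeError(
        "transformers 4.48.0 has a known import error with DeepSeek models; "
        "install an earlier or later transformers release before running this example."
    )
```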
13 changes: 12 additions & 1 deletion examples/sparse_2of4_quantization_fp8/README.md
@@ -93,7 +93,7 @@ oneshot(
)
```

3. **Save the Compressed Model**
### Saving the Compressed Model

The compressed model and tokenizer are saved to the output directory:

@@ -106,6 +106,17 @@ Output Directories:
- Without FP8: `Meta-Llama-3-8B-Instruct-2of4-sparse`
- With FP8: `Meta-Llama-3-8B-Instruct-2of4-W8A8-FP8-Dynamic-Per-Token`

#### Saving Without Sparse Compression

To save the model on disk without sparse compression:

```python
model.save_pretrained(save_dir, save_compressed=True, disable_sparse_compression=True)
tokenizer.save_pretrained(save_dir)
```

> **Note:** Saving a model with both `save_compressed` and `disable_sparse_compression` set still compresses the model with the quantization compressor; however, instead of the more disk-efficient sparsity compressor(s), the `dense` sparsity compressor is used. The `dense` compressor stores model parameters as-is and does not leverage sparsity for disk-efficient storage. These options only affect how the model is saved on disk; they do not change the actual pruning or quantization.

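For contrast, a minimal sketch of the default save path with sparse compression left enabled; it simply omits the `disable_sparse_compression` flag shown above and is illustrative rather than taken from the example script:

```python
# Default: the sparsity compressor is used, producing the more
# disk-efficient sparse representation on disk.
model.save_pretrained(save_dir, save_compressed=True)
tokenizer.save_pretrained(save_dir)
```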
### Validation

After compression, the script validates the model by generating a sample output:
4 changes: 3 additions & 1 deletion examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
@@ -115,5 +115,7 @@ def get_recipe(fp8_enabled):
print("==========================================\n")

# Save compressed model and tokenizer
model.save_pretrained(save_dir, save_compressed=args.fp8)
model.save_pretrained(
save_dir, save_compressed=args.fp8, disable_sparse_compression=True
)
tokenizer.save_pretrained(save_dir)
2 changes: 1 addition & 1 deletion examples/trl_mixin/ex_trl_constant.py
@@ -3,7 +3,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

from llmcompressor.transformers import TrainingArguments
from llmcompressor.args import TrainingArguments

model_path = "neuralmagic/Llama-2-7b-pruned50-retrained"
output_dir = "./output_trl_sft_test_7b_gsm8k_sft_data"
9 changes: 3 additions & 6 deletions examples/trl_mixin/ex_trl_distillation.py
@@ -1,11 +1,8 @@
from sft_trainer import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator

from llmcompressor.transformers import (
DataTrainingArguments,
TextGenerationDataset,
TrainingArguments,
)
from llmcompressor.args import DatasetArguments, TrainingArguments
from llmcompressor.transformers import TextGenerationDataset

model_path = "neuralmagic/Llama-2-7b-pruned50-retrained"
teacher_path = "neuralmagic/Llama-2-7b-gsm8k"
@@ -21,7 +18,7 @@
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load gsm8k using SparseML dataset tools
data_args = DataTrainingArguments(
data_args = DatasetArguments(
dataset="gsm8k", dataset_config_name="main", max_seq_length=512
)
dataset_manager = TextGenerationDataset.load_from_registry(
2 changes: 1 addition & 1 deletion examples/trl_mixin/sft_trainer.py
@@ -1,7 +1,7 @@
from trl import SFTConfig as TRLSFTConfig
from trl import SFTTrainer as TRLSFTTrainer

from llmcompressor.transformers import TrainingArguments
from llmcompressor.args import TrainingArguments
from llmcompressor.transformers.finetune.session_mixin import SessionManagerMixIn

__all__ = ["SFTTrainer"]
7 changes: 4 additions & 3 deletions src/llmcompressor/transformers/finetune/README.md
@@ -74,9 +74,10 @@ train(

Finetuning arguments are split into the following four groups (a usage sketch follows the list):

* ModelArguments: `src/llmcompressor/transformers/finetune/model_args.py`
* TrainingArguments: `src/llmcompressor/transformers/finetune/training_args.py`
* DataTrainingArguments: `src/llmcompressor/transformers/finetune/data/data_training_args.py`
* ModelArguments: `src/llmcompressor/transformers/utils/arg_parser/model_arguments.py`
* TrainingArguments: `src/llmcompressor/transformers/utils/arg_parser/training_arguments.py`
* DatasetArguments: `src/llmcompressor/transformers/utils/arg_parser/dataset_arguments.py`
* RecipeArguments: `src/llmcompressor/transformers/utils/arg_parser/recipe_arguments.py`
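A rough sketch of how these argument groups are constructed; the import path follows the examples updated in this commit, while the specific field values are illustrative assumptions:

```python
from llmcompressor.args import DatasetArguments, TrainingArguments

# Dataset configuration, mirroring the TRL distillation example in this commit.
data_args = DatasetArguments(
    dataset="gsm8k", dataset_config_name="main", max_seq_length=512
)

# Standard HF-style training arguments; output_dir and max_steps are placeholders.
training_args = TrainingArguments(output_dir="./finetune_output", max_steps=100)
```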


## Running One-Shot with FSDP
@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
sequential_update: false
targets: ["Linear"]
ignore: ["re:.*lm_head"]
quantization_stage:
run_type: oneshot
quantization_modifiers:
3 changes: 2 additions & 1 deletion tests/e2e/vLLM/recipes/WNA16_2of4/2of4_w4a16_recipe.yaml
@@ -4,7 +4,8 @@ sparsity_stage:
SparseGPTModifier:
sparsity: 0.5
mask_structure: "2:4"
sequential_update: false
targets: ["Linear"]
ignore: ["re:.*lm_head"]
quantization_stage:
run_type: oneshot
quantization_modifiers:
86 changes: 72 additions & 14 deletions tests/e2e/vLLM/test_vllm.py
@@ -1,12 +1,13 @@
import os
import re
import shutil
from pathlib import Path
from typing import Callable

import pytest
import yaml
from huggingface_hub import HfApi
from loguru import logger
from parameterized import parameterized_class

from llmcompressor.core import active_session
from tests.e2e.e2e_utils import run_oneshot_for_e2e_testing
@@ -20,19 +21,24 @@
vllm_installed = False
logger.warning("vllm is not installed. This test will be skipped")


HF_MODEL_HUB_NAME = "nm-testing"
TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", "")

TEST_DATA_FILE = os.environ.get("TEST_DATA_FILE", "")
SKIP_HF_UPLOAD = os.environ.get("SKIP_HF_UPLOAD", "")

@pytest.fixture
def record_config_file(record_testsuite_property: Callable[[str, object], None]):
test_data_file_name = TEST_DATA_FILE.split("configs/")[-1]
record_testsuite_property("TEST_DATA_FILE_NAME", test_data_file_name)
EXPECTED_SAVED_FILES = [
"config.json",
r"^model(?:-\d{5}-of-\d{5})?\.safetensors$",
"recipe.yaml",
"tokenizer.json",
]
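The safetensors entry above is a regular expression so that both single-file and sharded checkpoints are accepted; a quick standalone check of the pattern (an editor illustration, not part of this diff):

```python
import re

PATTERN = r"^model(?:-\d{5}-of-\d{5})?\.safetensors$"

# Single-file and sharded checkpoint names both match the pattern ...
assert re.fullmatch(PATTERN, "model.safetensors")
assert re.fullmatch(PATTERN, "model-00001-of-00002.safetensors")
# ... while unrelated weight files do not.
assert re.fullmatch(PATTERN, "pytorch_model.bin") is None
```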


# Will run each test case in its own process through run_tests.sh
# emulating vLLM CI testing
@requires_gpu_count(1)
@parameterized_class("test_data_file", [(TEST_DATA_FILE,)])
@pytest.mark.skipif(not vllm_installed, reason="vLLM is not installed, skipping test")
class TestvLLM:
"""
@@ -52,7 +58,9 @@ class TestvLLM:
""" # noqa: E501

def set_up(self):
eval_config = yaml.safe_load(Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
eval_config = yaml.safe_load(
Path(self.test_data_file).read_text(encoding="utf-8")
)

if os.environ.get("CADENCE", "commit") != eval_config.get("cadence"):
pytest.skip("Skipping test; cadence mismatch")
@@ -65,6 +73,7 @@ def set_up(self):
self.recipe = eval_config.get("recipe")
self.quant_type = eval_config.get("quant_type")
self.save_dir = eval_config.get("save_dir")
self.save_compressed = eval_config.get("save_compressed", True)

logger.info("========== RUNNING ==============")
logger.info(self.scheme)
@@ -79,7 +88,6 @@ def set_up(self):
]
self.api = HfApi()

@pytest.mark.usefixtures("record_config_file")
def test_vllm(self):
# Run vLLM with saved model
import torch
@@ -100,11 +108,19 @@
quant_type=self.quant_type,
)

# check that session contains recipe
self._check_session_contains_recipe()

logger.info("================= SAVING TO DISK ======================")
oneshot_model.save_pretrained(self.save_dir)
oneshot_model.save_pretrained(
self.save_dir, save_compressed=self.save_compressed
)
tokenizer.save_pretrained(self.save_dir)
recipe_path = os.path.join(self.save_dir, "recipe.yaml")

# check that expected files exist
self._check_save_dir_has_expected_files()

# Use the session to fetch the recipe;
# Reset session for next test case
session = active_session()
@@ -113,12 +129,22 @@ def test_vllm(self):
fp.write(recipe_yaml_str)
session.reset()

logger.info("================= UPLOADING TO HUB ======================")
if SKIP_HF_UPLOAD.lower() != "yes":
logger.info("================= UPLOADING TO HUB ======================")

self.api.upload_folder(
repo_id=f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e",
folder_path=self.save_dir,
)
stub = f"{HF_MODEL_HUB_NAME}/{self.save_dir}-e2e"

self.api.create_repo(
repo_id=stub,
exist_ok=True,
repo_type="model",
private=False,
)

self.api.upload_folder(
repo_id=stub,
folder_path=self.save_dir,
)

logger.info("================= RUNNING vLLM =========================")

@@ -146,3 +172,35 @@ def test_vllm(self):
def tear_down(self):
if self.save_dir is not None:
shutil.rmtree(self.save_dir)

def _check_session_contains_recipe(self) -> None:
session = active_session()
recipe_yaml_str = session.get_serialized_recipe()
assert recipe_yaml_str is not None

def _check_save_dir_has_expected_files(self):
files = os.listdir(self.save_dir)
logger.debug("Saved files: {}", files)

matched_patterns = set()

for expected in EXPECTED_SAVED_FILES:
# Find all files matching the expected pattern
matches = [
file
for file in files
if (
re.fullmatch(expected, file)
if expected.startswith("^")
else file == expected
)
]
if len(matches) > 0:
matched_patterns.add(expected)

assert len(matched_patterns) == len(EXPECTED_SAVED_FILES), (
"expected: ",
EXPECTED_SAVED_FILES,
"\n saved: ",
list(matched_patterns),
)
7 changes: 6 additions & 1 deletion tests/examples/utils.py
@@ -68,7 +68,10 @@ def copy_and_run_command(


def copy_and_run_script(
tmp_path: Path, example_dir: str, script_filename: str
tmp_path: Path,
example_dir: str,
script_filename: str,
flags: Optional[list[str]] = None,
) -> Tuple[List[str], CompletedProcess[str]]:
"""
Copies the contents of example_dir (relative to the current working directory) to
@@ -81,6 +84,8 @@ def copy_and_run_script(
:return: subprocess.CompletedProcess object
"""
command = [sys.executable, script_filename]
if flags:
command.extend(flags)
return command, copy_and_run_command(tmp_path, example_dir, command)


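A hedged sketch of how the new `flags` parameter of `copy_and_run_script` might be used from a test; the example directory, script name, and `--fp8` flag are assumptions for illustration, not values taken from this commit:

```python
from pathlib import Path

from tests.examples.utils import copy_and_run_script  # import path assumed


def test_example_with_extra_flag(tmp_path: Path):
    # Forward an extra CLI flag to the example script via the new `flags` parameter.
    command, result = copy_and_run_script(
        tmp_path,
        "sparse_2of4_quantization_fp8",
        "llama3_8b_2of4.py",
        flags=["--fp8"],
    )
    assert result.returncode == 0, f"command failed: {command}"
```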