WIP
Signed-off-by: Kyle Sayers <[email protected]>
kylesayrs committed Jan 30, 2025
1 parent 29f93d3 commit de38a64
Showing 7 changed files with 43 additions and 19 deletions.
5 changes: 4 additions & 1 deletion examples/quantization_w4a16/llama3_example.py
@@ -5,7 +5,8 @@
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
#MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
@@ -22,6 +23,7 @@
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
BATCH_SIZE = 2

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
@@ -64,6 +66,7 @@ def tokenize(sample):
recipe=recipe,
max_seq_length=MAX_SEQUENCE_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
per_device_oneshot_batch_size=BATCH_SIZE,
)

# Confirm generations of the quantized model look sane.
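For orientation, here is a minimal sketch of how the updated example feeds the new batch size into `oneshot`. Only the keyword arguments visible in this diff are confirmed; the wrapper function and its `model`, `ds`, and `recipe` parameters stand in for objects the example builds earlier and are illustrative assumptions.

from llmcompressor.transformers import oneshot

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
BATCH_SIZE = 2  # new in this commit


def run_calibration(model, ds, recipe):
    """Invoke oneshot the way the updated example does (sketch).

    `model`, `ds`, and `recipe` are the loaded model, tokenized calibration
    dataset, and quantization recipe prepared earlier in the example file.
    """
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        per_device_oneshot_batch_size=BATCH_SIZE,  # calibrate in batches of two samples
    )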
2 changes: 2 additions & 0 deletions src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -254,6 +254,8 @@ def on_initialize(self, state: State, **kwargs) -> bool:
)
if isinstance(exception, unfixable_errors):
raise exception

raise exception

warnings.warn("Falling back to layer_sequential pipeline")
try:
1 change: 1 addition & 0 deletions src/llmcompressor/pipelines/sequential/helpers.py
@@ -71,6 +71,7 @@ def trace_subgraphs(
concrete_args = populate_concrete_args(model, sample_input)

# trace
breakpoint()
with (
calibration_forward_context(model),
HooksMixin.disable_hooks(),
8 changes: 2 additions & 6 deletions src/llmcompressor/transformers/finetune/data/base.py
@@ -53,11 +53,7 @@ def __init__(
self.tokenizer = getattr(self.processor, "tokenizer", self.processor)

if self.tokenizer is not None:
# fill in pad token
if not self.tokenizer.pad_token:
self.tokenizer.pad_token = self.tokenizer.eos_token

# configure sequence length
# resolve sequence length
max_seq_length = data_args.max_seq_length
if data_args.max_seq_length > self.tokenizer.model_max_length:
logger.warning(
@@ -69,7 +65,7 @@ def __init__(
data_args.max_seq_length, self.tokenizer.model_max_length
)

# configure padding
# resolve padding
self.padding = (
False
if self.data_args.concatenate_data
15 changes: 9 additions & 6 deletions src/llmcompressor/transformers/finetune/data/data_helpers.py
@@ -30,7 +30,7 @@ def format_calibration_data(
batch_size: int = 1,
do_shuffle: bool = True,
processor: Optional[Processor] = None,
collate_fn: Callable = default_data_collator,
collate_fn: Optional[Callable] = None,
accelerator: Optional[Any] = None,
) -> List[torch.Tensor]:
"""
@@ -41,7 +41,9 @@ def format_calibration_data(
:param num_calibration_samples: number of data samples to convert
:param do_shuffle: whether to shuffle the dataset before selecting calibration
samples, true by default
:param collate_fn: optional custom collate function, or use default
:param collate_fn: optional custom collate function; defaults to
`DataCollatorWithPadding` if None is provided and a tokenizer can be
resolved, otherwise `default_data_collator` is used
:param accelerator: optional accelerator used when preparing in FSDP mode
:return: list of trimmed calibration data tensors
"""
@@ -61,16 +63,17 @@ def format_calibration_data(
tokenized_calibration = tokenized_dataset.select(range(safe_calibration_samples))

# collate data
breakpoint()
if collate_fn is None:
    tokenizer = getattr(processor, "tokenizer", processor)
    if hasattr(tokenizer, "pad"):
        collate_fn = DataCollatorWithPadding(tokenizer)
    else:
        warnings.warn(
            "Could not find processor, attempting to collate without padding "
            "(may fail for batch_size > 1)"
        )
        collate_fn = default_data_collator

dataloader_params = {
"batch_size": batch_size,
7 changes: 4 additions & 3 deletions src/llmcompressor/transformers/finetune/runner.py
@@ -49,14 +49,15 @@ def __init__(
data_args: "DataTrainingArguments",
model_args: "ModelArguments",
training_args: "TrainingArguments",
processor: Processor,
):
self._data_args = data_args
self._model_args = model_args
self._training_args = training_args

self.datasets = {}
self.trainer = None
self.processor = None
self.processor = processor
self.parent_output_dir = self._training_args.output_dir
self._output_dir = self._training_args.output_dir

@@ -68,8 +69,8 @@ def populate_datasets(self, processor: Processor, add_labels: bool = True):
:param processor: processor or tokenizer to use for dataset tokenization
:param add_labels: if True, add labels column to dataset splits
"""
# TODO: remove `processor` arg in favor of self.processor
if self._data_args.dataset is None:
self.processor = self._model_args.processor
logger.info(
"Running oneshot without calibration data. This is expected for "
"weight-only and dynamic quantization"
@@ -110,7 +111,7 @@ def _get_split_name(inp_str):
registry_id,
data_args=self._data_args,
split=split_str,
processor=processor,
processor=self.processor,
)
tokenized_datasets[split_name] = dataset_manager(add_labels=add_labels)

24 changes: 21 additions & 3 deletions src/llmcompressor/transformers/finetune/text_generation.py
@@ -20,6 +20,7 @@
import os
import warnings
from pathlib import PosixPath
from types import NoneType

from loguru import logger
from transformers import (
@@ -286,6 +287,20 @@ def initialize_processor_from_path(
return processor


def configure_processor(processor: Processor):
    # configure tokenizer pad_token, required for padding and data collation
    tokenizer = getattr(processor, "tokenizer", processor)
    if getattr(tokenizer, "pad_token", None) is None:
        if hasattr(tokenizer, "eos_token"):
            logger.debug("Tokenizer is missing pad_token, using eos_token instead")
            tokenizer.pad_token = tokenizer.eos_token
        else:
            logger.debug(
                "Tokenizer is missing pad_token and eos_token, "
                "this may lead to issues when padding"
            )


def main(
model_args: ModelArguments,
data_args: DataTrainingArguments,
@@ -361,8 +376,9 @@ def main(
teacher.eval()

processor = model_args.processor
if isinstance(processor, str) or processor is None:
if isinstance(processor, (str, NoneType)):
processor = initialize_processor_from_path(model_args, model, teacher)
configure_processor(processor)

pre_initialize_structure(model=model)

@@ -371,10 +387,12 @@ def main(

# Load datasets
stage_runner = StageRunner(
model_args=model_args, data_args=data_args, training_args=training_args
model_args=model_args, data_args=data_args, training_args=training_args, processor=processor
)
add_labels = training_args.do_train or training_args.run_stages
stage_runner.populate_datasets(processor=processor, add_labels=add_labels)
stage_runner.populate_datasets(
processor=processor, add_labels=add_labels
)
train_dataset = stage_runner.get_dataset_split("train")
eval_dataset = stage_runner.get_dataset_split("validation")
calib_dataset = stage_runner.get_dataset_split("calibration")

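The pad_token fallback added by configure_processor can be exercised in isolation. A small sketch follows; gpt2 is chosen only because its tokenizer ships with an eos_token but no pad_token, and the logic is inlined to mirror the diff rather than importing the WIP function.

from transformers import AutoTokenizer

# gpt2's tokenizer has an eos_token but no pad_token, so it hits the fallback path
tokenizer = AutoTokenizer.from_pretrained("gpt2")
assert tokenizer.pad_token is None

# mirror of configure_processor: reuse eos_token when pad_token is unset
if getattr(tokenizer, "pad_token", None) is None and hasattr(tokenizer, "eos_token"):
    tokenizer.pad_token = tokenizer.eos_token

print(tokenizer.pad_token)  # '<|endoftext|>'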