diff --git a/src/llmcompressor/__init__.py b/src/llmcompressor/__init__.py index f979a7453..e65cf51b3 100644 --- a/src/llmcompressor/__init__.py +++ b/src/llmcompressor/__init__.py @@ -38,8 +38,6 @@ active_session, callbacks, create_session, - finalize, - initialize, reset_session, ) -from llmcompressor.entrypoints import Oneshot, oneshot +from llmcompressor.entrypoints import Oneshot, oneshot, train diff --git a/src/llmcompressor/core/__init__.py b/src/llmcompressor/core/__init__.py index 47e710943..85a074869 100644 --- a/src/llmcompressor/core/__init__.py +++ b/src/llmcompressor/core/__init__.py @@ -13,8 +13,6 @@ active_session, callbacks, create_session, - finalize, - initialize, reset_session, ) from llmcompressor.core.state import Data, Hardware, ModifiedState, State @@ -35,8 +33,6 @@ "create_session", "active_session", "reset_session", - "initialize", - "finalize", "apply", "callbacks", "LifecycleCallbacks", diff --git a/src/llmcompressor/core/session_functions.py b/src/llmcompressor/core/session_functions.py index 4d12f22ff..b280febbe 100644 --- a/src/llmcompressor/core/session_functions.py +++ b/src/llmcompressor/core/session_functions.py @@ -1,18 +1,15 @@ import threading from contextlib import contextmanager -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional from llmcompressor.core.events import EventType from llmcompressor.core.session import CompressionSession from llmcompressor.core.state import ModifiedState -from llmcompressor.recipe import Recipe __all__ = [ "create_session", "active_session", "reset_session", - "initialize", - "finalize", "callbacks", "LifecycleCallbacks", ] @@ -58,79 +55,6 @@ def reset_session(): session._lifecycle.reset() -def initialize( - recipe: Union[str, List[str], "Recipe", List["Recipe"], None] = None, - recipe_stage: Union[str, List[str], None] = None, - recipe_args: Optional[Dict[str, Any]] = None, - model: Optional[Any] = None, - teacher_model: Optional[Any] = None, - optimizer: 
Optional[Any] = None, - attach_optim_callbacks: bool = True, - train_data: Optional[Any] = None, - val_data: Optional[Any] = None, - test_data: Optional[Any] = None, - calib_data: Optional[Any] = None, - copy_data: bool = True, - start: Optional[float] = None, - steps_per_epoch: Optional[int] = None, - batches_per_step: Optional[int] = None, - **kwargs, -) -> ModifiedState: - """ - A method to initialize the active session for sparsification - - :param recipe: the recipe to use for the sparsification, can be a path to a - recipe file, a raw recipe string, a recipe object, or a list of recipe objects. - :param recipe_stage: the stage to target for the sparsification - :param recipe_args: the args to use for overriding the recipe defaults - :param model: the model to sparsify - :param teacher_model: the teacher model to use for knowledge distillation - :param optimizer: the optimizer to use for the sparsification - :param attach_optim_callbacks: True to attach the optimizer callbacks to the - sparsification lifecycle, False otherwise - :param train_data: the training data to use for the sparsification - :param val_data: the validation data to use for the sparsification - :param test_data: the testing data to use for the sparsification - :param calib_data: the calibration data to use for the sparsification - :param copy_data: True to copy the data, False otherwise - :param start: the start epoch to use for the sparsification - :param steps_per_epoch: the number of steps per epoch to use for the - sparsification - :param batches_per_step: the number of batches per step to use for - sparsification - :param kwargs: additional kwargs to pass to the lifecycle's initialize method - :return: the modified state of the active session after initializing - """ - return active_session().initialize( - recipe=recipe, - recipe_stage=recipe_stage, - recipe_args=recipe_args, - model=model, - teacher_model=teacher_model, - optimizer=optimizer, - 
attach_optim_callbacks=attach_optim_callbacks, - train_data=train_data, - val_data=val_data, - test_data=test_data, - calib_data=calib_data, - copy_data=copy_data, - start=start, - steps_per_epoch=steps_per_epoch, - batches_per_step=batches_per_step, - **kwargs, - ) - - -def finalize(**kwargs) -> ModifiedState: - """ - Method to finalize the active session for sparsification - - :param kwargs: additional kwargs to pass to the lifecycle's finalize method - :return: the modified state of the active session after finalizing - """ - return active_session().finalize(**kwargs) - - class LifecycleCallbacks: """ A class for invoking lifecycle events for the active session diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md index 85cb0ce2f..8bc7f7b1a 100644 --- a/src/llmcompressor/entrypoints/README.md +++ b/src/llmcompressor/entrypoints/README.md @@ -1,23 +1,23 @@ -# LLM Compressor Entrypoints +# Compression and Fine-tuning Entrypoint ## Oneshot -Model optimizations compress models while preserving accuracy. One-shot in LLM-Compressor supports faster inference on vLLM by applying post-training quantization (PTQ) or sparsification +An ideal compression technique reduces memory footprint while maintaining accuracy. One-shot in LLM-Compressor supports faster inference on vLLM by applying post-training quantization (PTQ) or sparsification. ### PTQ PTQ is performed to reduce the precision of quantizable weights (e.g., linear layers) to a lower bit-width. Supported formats are: -- W4A16 -- W8A8-INT8 -- W8A8-FP8 +- [W4A16](../../../examples/quantization_w4a16/README.md) +- [W8A8-INT8](../../../examples/quantization_w8a8_int8/README.md) +- [W8A8-FP8](../../../examples/quantization_w8a8_fp8/README.md) ### Sparsification Sparsification reduces model complexity by pruning selected weight values to zero while retaining essential weights in a subset of parameters. 
Supported formats include: -- 2:4-Sparsity with FP8 Weight, FP8 Input Activation - +- [2:4-Sparsity with FP4 Weight](../../../examples/quantization_2of4_sparse_w4a16/README.md) +- [2:4-Sparsity with FP8 Weight, FP8 Input Activation](../../../examples/sparse_2of4_quantization_fp8/README.md) ## Code -Example scripts for all the above formats are located in the [examples](../../../examples/) folder. A [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: +Example scripts for all the above formats are located in the [examples](../../../examples/) folder. The [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: ```python from transformers import AutoModelForCausalLM, AutoTokenizer @@ -25,17 +25,23 @@ from transformers import AutoModelForCausalLM, AutoTokenizer from llmcompressor import oneshot from llmcompressor.modifiers.quantization import QuantizationModifier +# Define the model to compress MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +# Load the model model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", torch_dtype="auto" ) +# Load the tokenizer tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +# Define the recipe, scheme="FP8_DYNAMIC" compresses to W8A8, which is +# FP8 channel-wise for weight, and FP8 dynamic per token activation recipe = QuantizationModifier( targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] ) +# compress the model oneshot(model=model, recipe=recipe) ``` @@ -49,10 +55,21 @@ The high-level description of the argument parser is as follows: - `DatasetArguments`: Arguments for dataset-related configurations, such as calibration dataloaders. - `RecipeArguments`: Arguments for defining and configuring recipes that specify - optimization actions. + parameters for compression. For more information, please check the [README.md](../../llmcompressor/args/README.md) in `src/llmcompressor/args`. 
+### Saving the Compressed Model + +To save the compressed model, the recommended approach is to specify `output_dir` as the desired destination directory. By default, the model will be saved in a compressed format, reducing its disk space usage upon saving. + +```python +oneshot( + ..., + output_dir="./oneshot_model", # Automatically save the safetensor, config, recipe. Weights are saved in a compressed format +) +``` + ### Lifecycle @@ -64,22 +81,199 @@ The oneshot calibration lifecycle consists of three steps: - Patches the model to include additional functionality for saving with quantization configurations. 2. **Oneshot Calibration**: - - Optimizes the model based on the recipe (instructions for optimizing the model). The + - Compresses the model based on the recipe (instructions for optimizing the model). The recipe defines the `Modifiers` (e.g., `GPTQModifier`, `SparseGPTModifier`) to apply, which contain logic how to quantize or sparsify a model. 3. **Postprocessing**: - Saves the model, tokenizer/processor, and configuration to the specified `output_dir`. -### Saving an Optimized Model +This will automatically save the model weights to a compressed SafeTensors format. The tokenizer/processor, recipe, and the configuration file will also be saved. -To save an optimized model, the recommended approach is to specify `output_dir` in the input argument. For example, to save the model in the `./oneshot_model` directory, +## Train / Finetune +Compressed models can be trained to improve accuracy. Training is carried out using HuggingFace's [Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer). -```python3 -oneshot( - ..., - output_dir="./oneshot_model", +### Finetuning a Compressed Model +LLM-Compressor supports fine-tuning of quantized, sparsified, and sparse-quantized models. It offers both standard fine-tuning, knowledge distillation and SFT Trainer. 
+ +## Code + +### Finetuning + +A compressed model generated using `oneshot` is saved to disk in a compressed format. To load it, the model must be decompressed using `CompressedTensorsConfig` with `AutoModelForCausalLM`. If the above `oneshot` example script was executed and the compressed model was saved to `./oneshot_model`, the following code is used to perform fine-tuning: + + +```python +from transformers.utils.quantization_config import CompressedTensorsConfig + +from llmcompressor import create_session, train + +# The saving directory +output_dir = "./oneshot_model" + +# The model to train +model = AutoModelForCausalLM.from_pretrained( + output_dir, + device_map="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), ) -``` -This will automatically save the model in the SafeTensors format, along with the tokenizer/processor, recipe, and the configuration file. +dataset = "open_platypus" # Define dataset to use for kd +output_dir = "./finetuned_model" +splits = "train[:50%]" # Use 50% of the training data +max_steps = ( + 25 # Number of training steps (updates) before stopping the training process +) +num_calibration_samples = 8 # Number of workers processing datasets in parallel + +# Create an isolated session independent from the previous runs +with create_session(): + train( + model=model, # The model to finetune + dataset=dataset, # The data to carry out finetuning + output_dir=output_dir, # The output directory to save + num_calibration_samples=num_calibration_samples, # The number of workers to carry out dataset processing + splits=splits, # The dataset key and percentage of samples to use + max_steps=max_steps, # The total number of iterations to carry out training + ) +``` + + +### Knowledge Distillation + +To perform knowledge distillation, a teacher model and a student model (the compressed model) must be defined. The loss between the student and the teacher can be specified in the recipe by defining the `comparison` key. 
In this case, KL divergence is used to compare the output distributions of the student and the teacher. +Comparisons are defined in `/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_factory.py`. + +```python +# Define the teacher model +distill_teacher = AutoModelForCausalLM.from_pretrained( + "meta-llama/Meta-Llama-3-8B-Instruct", + device_map="auto", +) + +# Define the recipe: use the knowledge distillation modifier and target the `model.layers` using a regex, with KL divergence comparison +recipe = r""" +kd_stage: + distillation_modifiers: + OutputDistillationModifier: + targets: ["re:model.layers.\\d+$"] + comparison: "kl_divergence" + start: 0 + orig_scale: 1.0 + distill_scale: 1.0 +""" + +# Create a session isolated from the previous runs +with create_session(): + train( + ..., + distill_teacher=distill_teacher, # The teacher model + recipe=recipe, # The recipe to use + ) + +``` + +The output terminal will provide the sparsification, quantization and training metrics: + +```bash +2025-02-25T18:39:08.984855-0500 | log_model_sparsification | INFO - There are 8033013760 prunable params which have 0.02% avg sparsity. +2025-02-25T18:39:08.987302-0500 | log_model_sparsification | INFO - There are 8033013760 quantizable params, with a quantization percentage of 86.88%. 
+***** train metrics ***** + epoch = 0.016 + perplexity = 1.5422 + total_flos = 3221945GF + train_loss = 0.4332 + train_runtime = 0:03:53.39 + train_samples = 12463 + train_samples_per_second = 0.857 + train_steps_per_second = 0.107 +``` + +### End-to-end Script +The end-to-end script for carrying out `oneshot` for `W8A8-FP8` and then knowledge distillation is shown below: + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + +# The directory for saving +oneshot_output_dir = "./oneshot_model" + +# Load the model +model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto" +) +# Load the tokenizer +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Define the recipe. `scheme="FP8_DYNAMIC"` compresses to W8A8-FP8, which is +# FP8 channel-wise for weight, and FP8 dynamic per token activation +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"] +) + +# compress the model +oneshot(model=model, recipe=recipe, output_dir=oneshot_output_dir) + +from transformers.utils.quantization_config import CompressedTensorsConfig + +from llmcompressor import create_session, train + +# Student model +model = AutoModelForCausalLM.from_pretrained( + oneshot_output_dir, + device_map="auto", + quantization_config=CompressedTensorsConfig(run_compressed=False), +) + +dataset = "open_platypus" # Define dataset to use for knowledge distillation +finetune_output_dir = "./finetuned_model" # The output saving directory +splits = "train[:50%]" # Use 50% of the training data +max_steps = ( + 25 # The number of training steps (updates) before stopping the training process +) +num_calibration_samples = 8 # The number of workers processing datasets in parallel + +# Define teacher model +distill_teacher = AutoModelForCausalLM.from_pretrained( 
+ "meta-llama/Meta-Llama-3-8B-Instruct", + device_map="auto", +) + +# Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with +# KL divergence comparison +recipe = r""" +kd_stage: + distillation_modifiers: + OutputDistillationModifier: + targets: ["re:model.layers.\\d+$"] + comparison: "kl_divergence" + start: 0 + orig_scale: 1.0 + distill_scale: 1.0 +""" + +# Create an isolated session from the previous runs +with create_session(): + train( + model=model, # The student model + dataset=dataset, # The data to carry out finetuning + output_dir=finetune_output_dir, # Output directory to save + num_calibration_samples=num_calibration_samples, # The number of workers to carry out dataset processing + splits=splits, # The percentage of the subsets of a dataset to use + max_steps=max_steps, # The number of training steps + distill_teacher=distill_teacher, # The teacher model + recipe=recipe, # The recipe to use + ) +``` + +### SFT Trainer + +TRL's SFT Trainer can be used for sparse fine-tuning or applying sparse knowledge distillation. Examples are available in the `examples/` folder. 
+ +- [Sparse-fine-tune a 50% sparse Llama-7b model](../../../examples/trl_mixin/README.md) +- [Sparse-fine-tune a 50% sparse Llama-7b model using knowledge distillation](../../../examples/trl_mixin/README.md) \ No newline at end of file diff --git a/src/llmcompressor/entrypoints/__init__.py b/src/llmcompressor/entrypoints/__init__.py index 299ab9084..246742508 100644 --- a/src/llmcompressor/entrypoints/__init__.py +++ b/src/llmcompressor/entrypoints/__init__.py @@ -1,3 +1,4 @@ # flake8: noqa from .oneshot import Oneshot, oneshot +from .train import train from .utils import post_process, pre_process diff --git a/src/llmcompressor/entrypoints/train.py b/src/llmcompressor/entrypoints/train.py new file mode 100644 index 000000000..29d4006b3 --- /dev/null +++ b/src/llmcompressor/entrypoints/train.py @@ -0,0 +1,102 @@ +import math + +from loguru import logger + +from llmcompressor.args import parse_args +from llmcompressor.datasets.utils import get_processed_dataset +from llmcompressor.transformers.finetune.trainer import Trainer + +from .utils import post_process, pre_process + + +def train(**kwargs): + """ + Fine-tuning entrypoint that supports vanilla fine-tuning and + knowledge distillation for models compressed using `oneshot`. + + + This entrypoint is responsible for the entire fine-tuning lifecycle, including + preprocessing (model and tokenizer/processor initialization), fine-tuning, + and postprocessing (saving outputs). The instructions for fine-tuning a compressed + model can be specified by using a recipe. + + - **Input Keyword Arguments:** + `kwargs` are parsed into: + - `model_args`: Arguments for loading and configuring a pretrained model + (e.g., `AutoModelForCausalLM`). + - `dataset_args`: Arguments for dataset-related configurations, such as + calibration dataloaders. + - `recipe_args`: Arguments for defining and configuring recipes that specify + optimization actions. 
+ - `training_args`: Arguments for defining and configuring training parameters. + + Parsers are defined in `src/llmcompressor/args/`. + + - **Lifecycle Overview:** + The fine-tuning lifecycle consists of three steps: + 1. **Preprocessing**: + - Instantiates a pretrained model and tokenizer/processor. + - Ensures input and output embedding layers are untied if they share + tensors. + - Patches the model to include additional functionality for saving with + quantization configurations. + 2. **Training**: + - Finetunes the model using a global `CompressionSession` and applies + recipe-defined modifiers (e.g., `ConstantPruningModifier`, + `OutputDistillationModifier`). + 3. **Postprocessing**: + - Saves the model, tokenizer/processor, and configuration to the specified + `output_dir`. + + - **Usage:** + ```python + train(model=model, recipe=recipe, dataset=dataset) + + ``` + + """ + model_args, dataset_args, recipe_args, training_args, _ = parse_args( + include_training_args=True, **kwargs + ) + + pre_process(model_args) + + processed_dataset = get_processed_dataset( + dataset_args=dataset_args, + processor=model_args.processor, + ) + training_dataset = processed_dataset.get("train") + + trainer = Trainer( + model=model_args.model, + teacher=model_args.distill_teacher, + recipe=recipe_args.recipe, + recipe_args=recipe_args.recipe_args, + args=training_args, + model_args=model_args, + dataset_args=dataset_args, + train_dataset=training_dataset, + processing_class=model_args.processor, + data_collator=dataset_args.data_collator, + ) + + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + + logger.info("*** Train ***") + train_result = trainer.train( + resume_from_checkpoint=checkpoint, + ) + + # return output + metrics = train_result.metrics + metrics["train_samples"] = len(training_dataset) + metrics["perplexity"] = math.exp(metrics["train_loss"]) + trainer.log_metrics("train", metrics) + 
trainer.save_metrics("train", metrics) + + # this includes saving the state, optimizer and scheduler + trainer.save_model(output_dir=training_args.output_dir) + + post_process(model_args=model_args, output_dir=training_args.output_dir) diff --git a/src/llmcompressor/transformers/finetune/README.md b/src/llmcompressor/transformers/finetune/README.md index e1312b799..f8af96df3 100644 --- a/src/llmcompressor/transformers/finetune/README.md +++ b/src/llmcompressor/transformers/finetune/README.md @@ -1,49 +1,9 @@ # Sparse Finetuning -## Launching from Console Scripts - -### with DataParallel (default) - -```bash -llmcompressor.transformers.text_generation.train - --model PATH_TO_MODEL - --distill_teacher PATH_TO_TEACHER - --dataset DATASET_NAME - --recipe PATH_TO_RECIPE - --output_dir PATH_TO_OUTPUT - --num_train_epochs 1 - --splits "train" -``` - -Also supported: - -* `llmcompressor.transformers.text_generation.finetune` (alias for train) -* `llmcompressor.transformers.text_generation.oneshot` -* `llmcompressor.transformers.text_generation.eval` -* `llmcompressor.transformers.text_generation.apply`(for running in sequential stage mode) -* `llmcompressor.transformers.text_generation.compress` (alias for apply) - -### with FSDP - -```bash -accelerate launch - --config_file example_fsdp_config.yaml - --no_python llmcompressor.transformers.text_generation.finetune - --model PATH_TO_MODEL - --distill_teacher PATH_TO_TEACHER - --dataset DATASET_NAME - --recipe PATH_TO_RECIPE - --output_dir PATH_TO_OUTPUT - --num_train_epochs 1 - --splits "train" -``` - -See [configure_fsdp.md](../../../../examples/finetuning/configure_fsdp.md) for additional instructions on setting up FSDP configuration - ## Launching from Python ```python -from llmcompressor.transformers import train +from llmcompressor import train model = "./obcq_deployment" teacher_model = "Xenova/llama2.c-stories15M" @@ -74,10 +34,10 @@ train( Finetuning arguments are split up into 3 groups: -* ModelArguments: 
`src/llmcompressor/transformers/utils/arg_parser/model_arguments.py` -* TrainingArguments: `src/llmcompressor/transformers/utils/arg_parser/training_arguments.py` -* DatasetArguments: `src/llmcompressor/transformers/utils/arg_parser/dataset_arguments.py` -* RecipeArguments: `src/llmcompressor/transformers/utils/arg_parser/recipe_arguments.py` +* ModelArguments: `src/llmcompressor/args/model_arguments.py` +* TrainingArguments: `src/llmcompressor/args/training_arguments.py` +* DatasetArguments: `src/llmcompressor/args/dataset_arguments.py` +* RecipeArguments: `src/llmcompressor/args/recipe_arguments.py` ## Running Multi-Stage Recipes @@ -90,9 +50,6 @@ mode. See [example_alternating_recipe.yaml](../../../../examples/finetuning/example_alternating_recipe.yaml) for an example of a staged recipe for Llama. -### Python Example -(This can also be run with FSDP by launching the script as `accelerate launch --config_file example_fsdp_config.yaml test_multi.py`) - test_multi.py ```python from llmcompressor.transformers import apply diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py index f64916e69..20d9ae510 100644 --- a/src/llmcompressor/transformers/finetune/session_mixin.py +++ b/src/llmcompressor/transformers/finetune/session_mixin.py @@ -11,13 +11,7 @@ from transformers.trainer_callback import TrainerState from transformers.trainer_utils import get_last_checkpoint -from llmcompressor.core import ( - active_session, - callbacks, - create_session, - finalize, - initialize, -) +from llmcompressor.core import active_session, callbacks, create_session from llmcompressor.metrics import LoggerManager from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import ( KDModelWrapper, @@ -151,18 +145,20 @@ def initialize_session( self.accelerator.wait_for_everyone() with summon_full_params_context(self.model, offload_to_cpu=True): - initialize( - model=self.model, - 
teacher_model=self.teacher, # TODO: what about for self/disable? + active_session().initialize( recipe=self.recipe, recipe_stage=stage, recipe_args=self.recipe_args, + model=self.model, + teacher_model=self.teacher, # TODO: what about for self/disable? train_data=train_data, start=epoch, copy_data=False, + attach_optim_callbacks=True, fsdp_active=self.is_fsdp_enabled, metadata=self.metadata, ) + self.accelerator.wait_for_everyone() model = get_session_model() self.model_wrapped = self.model = model @@ -186,7 +182,7 @@ def finalize_session(self): with summon_full_params_context(self.model, offload_to_cpu=True): # in order to update each layer we need to gathers all its parameters - finalize() + active_session().finalize() logger.info("Finalized LLM Compressor session") model = get_session_model() self.model = model @@ -222,7 +218,9 @@ def create_optimizer(self): len(self.train_dataset) / total_batch_size ) - initialize(optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch) + active_session().initialize( + optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch + ) return self.optimizer diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py index 66652b686..1aad787e7 100644 --- a/src/llmcompressor/transformers/finetune/text_generation.py +++ b/src/llmcompressor/transformers/finetune/text_generation.py @@ -17,12 +17,11 @@ # Adapted from https://github.com/huggingface/transformers # vllm-project: no copyright -import warnings + from pathlib import PosixPath from compressed_tensors.utils.helpers import deprecated from loguru import logger -from transformers import HfArgumentParser from llmcompressor.args import ( DatasetArguments, @@ -42,15 +41,6 @@ from llmcompressor.utils.fsdp.helpers import is_fsdp_model -def train(**kwargs): - """ - CLI entrypoint for running training - """ - model_args, dataset_args, recipe_args, training_args = parse_args(**kwargs) - 
training_args.do_train = True - main(model_args, dataset_args, recipe_args, training_args) - - @deprecated( message=( "`from llmcompressor.transformers import oneshot` is deprecated, " @@ -63,6 +53,18 @@ def oneshot(**kwargs) -> None: oneshot(**kwargs) +@deprecated( + message=( + "`from llmcompressor.transformers import train` is deprecated, " + "please use `from llmcompressor import train`." + ) +) +def train(**kwargs): + from llmcompressor import train + + train(**kwargs) + + def apply(**kwargs): """ CLI entrypoint for any of training, oneshot @@ -85,54 +87,6 @@ def compress(**kwargs): apply(**kwargs) -def parse_args(**kwargs): - """ - Parses kwargs by grouping into model, data or training arg groups: - * model_args in - src/llmcompressor/transformers/utils/arg_parser/model_args.py - * dataset_args in - src/llmcompressor/transformers/utils/arg_parser/dataset_args.py - * recipe_args in - src/llmcompressor/transformers/utils/arg_parser/recipe_args.py - * training_args in - src/llmcompressor/transformers/utils/arg_parser/training_args.py - """ - parser = HfArgumentParser( - (ModelArguments, DatasetArguments, RecipeArguments, TrainingArguments) - ) - - if not kwargs: - parsed_args = parser.parse_args_into_dataclasses() - else: - parsed_args = parser.parse_dict(kwargs) - - model_args, dataset_args, recipe_args, training_args = parsed_args - if recipe_args.recipe_args is not None: - if not isinstance(recipe_args.recipe_args, dict): - arg_dict = {} - for recipe_arg in recipe_args.recipe_args: - key, value = recipe_arg.split("=") - arg_dict[key] = value - recipe_args.recipe_args = arg_dict - - # raise depreciation warnings - if dataset_args.remove_columns is not None: - warnings.warn( - "`remove_columns` argument is depreciated. 
When tokenizing datasets, all " - "columns which are invalid inputs the tokenizer will be removed", - DeprecationWarning, - ) - - # silently assign tokenizer to processor - if model_args.tokenizer: - if model_args.processor: - raise ValueError("Cannot use both a tokenizer and processor") - model_args.processor = model_args.tokenizer - model_args.tokenizer = None - - return model_args, dataset_args, recipe_args, training_args - - def main( model_args: ModelArguments, dataset_args: DatasetArguments, @@ -263,15 +217,6 @@ def main( # exit immediately return - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - stage_runner.train(checkpoint) - # save if model was provided as a string or custom output_dir was set if isinstance(model_args.model, str) or ( training_args.output_dir diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py index 37524069c..2195ae4e6 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py @@ -18,7 +18,7 @@ class TestFinetuneNoRecipeCustomDataset(unittest.TestCase): def _test_finetune_wout_recipe_custom_dataset(self): - from llmcompressor.transformers import train + from llmcompressor import train dataset_path = Path(tempfile.mkdtemp()) diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py b/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py index 7facd088e..42eb495d8 100644 --- a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py +++ b/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py @@ -20,7 +20,7 @@ def 
setUp(self): self.output = "./finetune_output" def test_finetune_without_recipe(self): - from llmcompressor.transformers import train + from llmcompressor import train recipe_str = None device = "cuda:0" diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py index e8e0ae426..ec68e1f5d 100644 --- a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py +++ b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py @@ -6,10 +6,9 @@ from transformers import AutoModelForCausalLM from transformers.utils.quantization_config import CompressedTensorsConfig -from llmcompressor import oneshot +from llmcompressor import oneshot, train from llmcompressor.core import create_session from llmcompressor.modifiers.quantization import QuantizationModifier -from llmcompressor.transformers import train @pytest.mark.unit diff --git a/tests/llmcompressor/transformers/finetune/test_safetensors.py b/tests/llmcompressor/transformers/finetune/test_safetensors.py index 84c1bf1b2..462c529e6 100644 --- a/tests/llmcompressor/transformers/finetune/test_safetensors.py +++ b/tests/llmcompressor/transformers/finetune/test_safetensors.py @@ -22,7 +22,7 @@ def setUp(self): self.output = Path("./finetune_output") def test_safetensors(self): - from llmcompressor.transformers import train + from llmcompressor import train device = "cuda:0" output_dir = self.output / "output1" diff --git a/tests/llmcompressor/transformers/test_clear_ml.py b/tests/llmcompressor/transformers/test_clear_ml.py index 4a7922a66..94abd1a62 100644 --- a/tests/llmcompressor/transformers/test_clear_ml.py +++ b/tests/llmcompressor/transformers/test_clear_ml.py @@ -10,7 +10,7 @@ except Exception: is_clearml = False -from llmcompressor.transformers import train +from llmcompressor import train @pytest.mark.skipif(not is_clearml, reason="clearML not installed")