diff --git a/src/llmcompressor/__init__.py b/src/llmcompressor/__init__.py
index f979a7453..e65cf51b3 100644
--- a/src/llmcompressor/__init__.py
+++ b/src/llmcompressor/__init__.py
@@ -38,8 +38,6 @@
     active_session,
     callbacks,
     create_session,
-    finalize,
-    initialize,
     reset_session,
 )
-from llmcompressor.entrypoints import Oneshot, oneshot
+from llmcompressor.entrypoints import Oneshot, oneshot, train
diff --git a/src/llmcompressor/core/__init__.py b/src/llmcompressor/core/__init__.py
index 47e710943..85a074869 100644
--- a/src/llmcompressor/core/__init__.py
+++ b/src/llmcompressor/core/__init__.py
@@ -13,8 +13,6 @@
     active_session,
     callbacks,
     create_session,
-    finalize,
-    initialize,
     reset_session,
 )
 from llmcompressor.core.state import Data, Hardware, ModifiedState, State
@@ -35,8 +33,6 @@
     "create_session",
     "active_session",
     "reset_session",
-    "initialize",
-    "finalize",
     "apply",
     "callbacks",
     "LifecycleCallbacks",
diff --git a/src/llmcompressor/core/session_functions.py b/src/llmcompressor/core/session_functions.py
index 4d12f22ff..b280febbe 100644
--- a/src/llmcompressor/core/session_functions.py
+++ b/src/llmcompressor/core/session_functions.py
@@ -1,18 +1,15 @@
 import threading
 from contextlib import contextmanager
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional
 
 from llmcompressor.core.events import EventType
 from llmcompressor.core.session import CompressionSession
 from llmcompressor.core.state import ModifiedState
-from llmcompressor.recipe import Recipe
 
 __all__ = [
     "create_session",
     "active_session",
     "reset_session",
-    "initialize",
-    "finalize",
     "callbacks",
     "LifecycleCallbacks",
 ]
@@ -58,79 +55,6 @@ def reset_session():
     session._lifecycle.reset()
 
 
-def initialize(
-    recipe: Union[str, List[str], "Recipe", List["Recipe"], None] = None,
-    recipe_stage: Union[str, List[str], None] = None,
-    recipe_args: Optional[Dict[str, Any]] = None,
-    model: Optional[Any] = None,
-    teacher_model: Optional[Any] = None,
-    optimizer: Optional[Any] = None,
-    attach_optim_callbacks: bool = True,
-    train_data: Optional[Any] = None,
-    val_data: Optional[Any] = None,
-    test_data: Optional[Any] = None,
-    calib_data: Optional[Any] = None,
-    copy_data: bool = True,
-    start: Optional[float] = None,
-    steps_per_epoch: Optional[int] = None,
-    batches_per_step: Optional[int] = None,
-    **kwargs,
-) -> ModifiedState:
-    """
-    A method to initialize the active session for sparsification
-
-    :param recipe: the recipe to use for the sparsification, can be a path to a
-        recipe file, a raw recipe string, a recipe object, or a list of recipe objects.
-    :param recipe_stage: the stage to target for the sparsification
-    :param recipe_args: the args to use for overriding the recipe defaults
-    :param model: the model to sparsify
-    :param teacher_model: the teacher model to use for knowledge distillation
-    :param optimizer: the optimizer to use for the sparsification
-    :param attach_optim_callbacks: True to attach the optimizer callbacks to the
-        sparsification lifecycle, False otherwise
-    :param train_data: the training data to use for the sparsification
-    :param val_data: the validation data to use for the sparsification
-    :param test_data: the testing data to use for the sparsification
-    :param calib_data: the calibration data to use for the sparsification
-    :param copy_data: True to copy the data, False otherwise
-    :param start: the start epoch to use for the sparsification
-    :param steps_per_epoch: the number of steps per epoch to use for the
-        sparsification
-    :param batches_per_step: the number of batches per step to use for
-        sparsification
-    :param kwargs: additional kwargs to pass to the lifecycle's initialize method
-    :return: the modified state of the active session after initializing
-    """
-    return active_session().initialize(
-        recipe=recipe,
-        recipe_stage=recipe_stage,
-        recipe_args=recipe_args,
-        model=model,
-        teacher_model=teacher_model,
-        optimizer=optimizer,
-        attach_optim_callbacks=attach_optim_callbacks,
-        train_data=train_data,
-        val_data=val_data,
-        test_data=test_data,
-        calib_data=calib_data,
-        copy_data=copy_data,
-        start=start,
-        steps_per_epoch=steps_per_epoch,
-        batches_per_step=batches_per_step,
-        **kwargs,
-    )
-
-
-def finalize(**kwargs) -> ModifiedState:
-    """
-    Method to finalize the active session for sparsification
-
-    :param kwargs: additional kwargs to pass to the lifecycle's finalize method
-    :return: the modified state of the active session after finalizing
-    """
-    return active_session().finalize(**kwargs)
-
-
 class LifecycleCallbacks:
     """
     A class for invoking lifecycle events for the active session
diff --git a/src/llmcompressor/entrypoints/README.md b/src/llmcompressor/entrypoints/README.md
index 85cb0ce2f..8bc7f7b1a 100644
--- a/src/llmcompressor/entrypoints/README.md
+++ b/src/llmcompressor/entrypoints/README.md
@@ -1,23 +1,23 @@
-# LLM Compressor Entrypoints
+# Compression and Fine-tuning Entrypoint
 
 ## Oneshot
 
-Model optimizations compress models while preserving accuracy. One-shot in LLM-Compressor supports faster inference on vLLM by applying post-training quantization (PTQ) or sparsification
+An ideal compression technique reduces memory footprint while maintaining accuracy. One-shot in LLM-Compressor supports faster inference on vLLM by applying post-training quantization (PTQ) or sparsification.
 
 ### PTQ
 PTQ is performed to reduce the precision of quantizable weights (e.g., linear layers) to a lower bit-width. Supported formats are:
-- W4A16
-- W8A8-INT8 
-- W8A8-FP8
+- [W4A16](../../../examples/quantization_w4a16/README.md)
+- [W8A8-INT8](../../../examples/quantization_w8a8_int8/README.md)
+- [W8A8-FP8](../../../examples/quantization_w8a8_fp8/README.md)
 
 ### Sparsification
 Sparsification reduces model complexity by pruning selected weight values to zero while retaining essential weights in a subset of parameters. Supported formats include:
--  2:4-Sparsity with FP8 Weight, FP8 Input Activation
-
+-  [2:4-Sparsity with INT4 Weight (W4A16)](../../../examples/quantization_2of4_sparse_w4a16/README.md)
+-  [2:4-Sparsity with FP8 Weight, FP8 Input Activation](../../../examples/sparse_2of4_quantization_fp8/README.md)
 
 ## Code
 
-Example scripts for all the above formats are located in the [examples](../../../examples/) folder. A [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: 
+Example scripts for all the above formats are located in the [examples](../../../examples/) folder. The [W8A8-FP8](../../../examples/quantization_w8a8_fp8/llama3_example.py) example is shown below: 
 
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -25,17 +25,23 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
 
+# Define the model to compress
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 
+# Load the model
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID, device_map="auto", torch_dtype="auto"
 )
+# Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
+# Define the recipe, scheme="FP8_DYNAMIC" compresses to W8A8, which is
+# FP8 channel-wise for weight, and FP8 dynamic per token activation
 recipe = QuantizationModifier(
     targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
 )
 
+# compress the model
 oneshot(model=model, recipe=recipe)
 ```
 
@@ -49,10 +55,21 @@ The high-level description of the argument parser is as follows:
 - `DatasetArguments`: Arguments for dataset-related configurations, such as
     calibration dataloaders.
 - `RecipeArguments`: Arguments for defining and configuring recipes that specify
-    optimization actions.
+    parameters for compression.
 
 For more information, please check the [README.md](../../llmcompressor/args/README.md) in `src/llmcompressor/args`.
 
+### Saving the Compressed Model
+
+To save the compressed model, the recommended approach is to specify `output_dir` as the desired destination directory. By default, the model will be saved in a compressed format, reducing its disk space usage upon saving.
+
+```python
+oneshot(
+    ...,
+    output_dir="./oneshot_model", # Automatically saves the safetensors weights, config, and recipe. Weights are saved in a compressed format
+)
+```    
+
 
 ### Lifecycle
 
@@ -64,22 +81,199 @@ The oneshot calibration lifecycle consists of three steps:
     - Patches the model to include additional functionality for saving with
         quantization configurations.
 2. **Oneshot Calibration**:
-    - Optimizes the model based on the recipe (instructions for optimizing the model). The 
+    - Compresses the model based on the recipe (instructions for optimizing the model). The 
         recipe defines the `Modifiers` (e.g., `GPTQModifier`, `SparseGPTModifier`) to apply, which
         contain logic how to quantize or sparsify a model. 
 3. **Postprocessing**:
     - Saves the model, tokenizer/processor, and configuration to the specified
         `output_dir`.
 
-### Saving an Optimized Model
+This will automatically save the model weights to a compressed SafeTensors format. The tokenizer/processor, recipe, and the configuration file will also be saved.
 
-To save an optimized model, the recommended approach is to specify `output_dir` in the input argument. For example, to save the model in the `./oneshot_model` directory,
+## Train / Finetune
+Compressed models can be trained to improve accuracy. Training is carried out using HuggingFace's [Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer).
 
-```python3
-oneshot(
-    ...,
-    output_dir="./oneshot_model",
+### Finetuning a Compressed Model
+LLM-Compressor supports fine-tuning of quantized, sparsified, and sparse-quantized models. It offers standard fine-tuning, knowledge distillation, and fine-tuning with TRL's SFT Trainer.
+
+## Code
+
+### Finetuning
+
+A compressed model generated using `oneshot` is saved to disk in a compressed format. To load it, the model must be decompressed using `CompressedTensorsConfig` with `AutoModelForCausalLM`. If the above `oneshot` example script was executed and the compressed model was saved to `./oneshot_model`, the following code is used to perform fine-tuning:
+
+
+```python
+from transformers.utils.quantization_config import CompressedTensorsConfig
+
+from llmcompressor import create_session, train
+
+# The saving directory
+output_dir = "./oneshot_model"
+
+# The model to train
+model = AutoModelForCausalLM.from_pretrained(
+    output_dir,
+    device_map="auto",
+    quantization_config=CompressedTensorsConfig(run_compressed=False),
 )
-```    
 
-This will automatically save the model in the SafeTensors format, along with the tokenizer/processor, recipe, and the configuration file.
+dataset = "open_platypus"  # Define dataset to use for kd
+output_dir = "./finetuned_model"
+splits = "train[:50%]"  # Use 50% of the training data
+max_steps = (
+    25  # Number of training steps (updates) before stopping the training process
+)
+num_calibration_samples = 8  # Number of dataset samples to use (a sample count, not a worker count)
+
+# Create an isolated session independent from the previous runs
+with create_session():
+    train(
+        model=model,  # The model to finetune
+        dataset=dataset,  # The data to carry out finetuning
+        output_dir=output_dir,  # The output directory to save
+        num_calibration_samples=num_calibration_samples,  # The number of dataset samples to use
+        splits=splits,  # The dataset key and percentage of samples to use
+        max_steps=max_steps,  # The total number of iterations to carry out training
+    )
+```
+
+
+### Knowledge Distillation
+
+To perform knowledge distillation, a teacher model and a student model (the compressed model) must be defined. The loss between the student and the teacher can be specified in the recipe by defining the `comparison` key. In this case, KL divergence is used to compare the output distributions of the student and the teacher.
+Comparisons are defined in `/src/llmcompressor/modifiers/distillation/utils/pytorch/kd_factory.py`.
+
+```python
+# Define the teacher model
+distill_teacher = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3-8B-Instruct",  
+    device_map="auto",
+)
+
+# Define the recipe: use the knowledge distillation modifier and target `model.layers` using a regex, with KL divergence comparison
+recipe = r"""
+kd_stage:
+  distillation_modifiers:
+    OutputDistillationModifier:
+        targets: ["re:model.layers.\\d+$"]
+        comparison: "kl_divergence"
+        start: 0
+        orig_scale: 1.0
+        distill_scale: 1.0
+"""
+
+# Create an isolated session independent from the previous runs
+with create_session():
+    train(
+        ...
+        distill_teacher=distill_teacher,    # The teacher model
+        recipe=recipe,                      # The recipe to use
+    )
+
+```
+
+The output terminal will provide the sparsification, quantization and training metrics:
+
+```bash
+2025-02-25T18:39:08.984855-0500 | log_model_sparsification | INFO - There are 8033013760 prunable params which have 0.02% avg sparsity.
+2025-02-25T18:39:08.987302-0500 | log_model_sparsification | INFO - There are 8033013760 quantizable params, with a quantization percentage of 86.88%.
+***** train metrics *****
+  epoch                    =      0.016
+  perplexity               =     1.5422
+  total_flos               =  3221945GF
+  train_loss               =     0.4332
+  train_runtime            = 0:03:53.39
+  train_samples            =      12463
+  train_samples_per_second =      0.857
+  train_steps_per_second   =      0.107
+```
+
+### End-to-end Script 
+The end-to-end script for carrying out `oneshot` for `W8A8-FP8` and then knowledge distillation is shown below:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+
+# The directory for saving
+oneshot_output_dir = "./oneshot_model"
+
+# Load the model
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto"
+)
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Define the recipe. `scheme="FP8_DYNAMIC"` compresses to W8A8-FP8, which is
+# FP8 channel-wise for weight, and FP8 dynamic per token activation
+recipe = QuantizationModifier(
+    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+)
+
+# compress the model
+oneshot(model=model, recipe=recipe, output_dir=oneshot_output_dir)
+
+from transformers.utils.quantization_config import CompressedTensorsConfig
+
+from llmcompressor import create_session, train
+
+# Student model
+model = AutoModelForCausalLM.from_pretrained(
+    oneshot_output_dir,
+    device_map="auto",
+    quantization_config=CompressedTensorsConfig(run_compressed=False),
+)
+
+dataset = "open_platypus"  # Define dataset to use for knowledge distillation
+finetune_output_dir = "./finetuned_model"  # The output saving directory
+splits = "train[:50%]"  # Use 50% of the training data
+max_steps = (
+    25  # The number of training steps (updates) before stopping the training process
+)
+num_calibration_samples = 8  # The number of dataset samples to use (a sample count, not a worker count)
+
+# Define teacher model
+distill_teacher = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    device_map="auto",
+)
+
+# Define the recipe, use knowledge distillation modifier and target the `model.layers` using a regex with
+# KL divergence comparison
+recipe = r"""
+kd_stage:
+  distillation_modifiers:
+    OutputDistillationModifier:
+        targets: ["re:model.layers.\\d+$"]
+        comparison: "kl_divergence"
+        start: 0
+        orig_scale: 1.0
+        distill_scale: 1.0
+"""
+
+# Create an isolated session independent from the previous runs
+with create_session():
+    train(
+        model=model,  # The student model
+        dataset=dataset,  # The data to carry out finetuning
+        output_dir=finetune_output_dir,  # Output directory to save
+        num_calibration_samples=num_calibration_samples,  # The number of dataset samples to use
+        splits=splits,  # The percentage of the subsets of a dataset to use
+        max_steps=max_steps,  # The number of training steps
+        distill_teacher=distill_teacher,  # The teacher model
+        recipe=recipe,  # The recipe to use
+    )
+```
+
+### SFT Trainer
+
+TRL's SFT Trainer can be used for sparse fine-tuning or applying sparse knowledge distillation. Examples are available in the `examples/` folder.
+
+- [Sparse-fine-tune a 50% sparse Llama-7b model](../../../examples/trl_mixin/README.md)
+- [Sparse-fine-tune a 50% sparse Llama-7b model using knowledge distillation](../../../examples/trl_mixin/README.md)
\ No newline at end of file
diff --git a/src/llmcompressor/entrypoints/__init__.py b/src/llmcompressor/entrypoints/__init__.py
index 299ab9084..246742508 100644
--- a/src/llmcompressor/entrypoints/__init__.py
+++ b/src/llmcompressor/entrypoints/__init__.py
@@ -1,3 +1,4 @@
 # flake8: noqa
 from .oneshot import Oneshot, oneshot
+from .train import train
 from .utils import post_process, pre_process
diff --git a/src/llmcompressor/entrypoints/train.py b/src/llmcompressor/entrypoints/train.py
new file mode 100644
index 000000000..29d4006b3
--- /dev/null
+++ b/src/llmcompressor/entrypoints/train.py
@@ -0,0 +1,102 @@
+import math
+
+from loguru import logger
+
+from llmcompressor.args import parse_args
+from llmcompressor.datasets.utils import get_processed_dataset
+from llmcompressor.transformers.finetune.trainer import Trainer
+
+from .utils import post_process, pre_process
+
+
+def train(**kwargs):
+    """
+    Fine-tuning entrypoint that supports vanilla fine-tuning and
+    knowledge distillation for compressed model using `oneshot`.
+
+
+    This entrypoint is responsible for the entire fine-tuning lifecycle, including
+    preprocessing (model and tokenizer/processor initialization), fine-tuning,
+    and postprocessing (saving outputs). The instructions for fine-tuning a compressed
+    model can be specified by using a recipe.
+
+    - **Input Keyword Arguments:**
+        `kwargs` are parsed into:
+        - `model_args`: Arguments for loading and configuring a pretrained model
+          (e.g., `AutoModelForCausalLM`).
+        - `dataset_args`: Arguments for dataset-related configurations, such as
+          calibration dataloaders.
+        - `recipe_args`: Arguments for defining and configuring recipes that specify
+          optimization actions.
+        - `training_args`: Arguments for defining and configuring training parameters.
+
+        Parsers are defined in `src/llmcompressor/args/`.
+
+    - **Lifecycle Overview:**
+        The fine-tuning lifecycle consists of three steps:
+        1. **Preprocessing**:
+            - Instantiates a pretrained model and tokenizer/processor.
+            - Ensures input and output embedding layers are untied if they share
+              tensors.
+            - Patches the model to include additional functionality for saving with
+              quantization configurations.
+        2. **Training**:
+            - Finetunes the model using a global `CompressionSession` and applies
+              recipe-defined modifiers (e.g., `ConstantPruningModifier`,
+              `OutputDistillationModifier`).
+        3. **Postprocessing**:
+            - Saves the model, tokenizer/processor, and configuration to the specified
+              `output_dir`.
+
+    - **Usage:**
+        ```python
+        train(model=model, recipe=recipe, dataset=dataset)
+
+        ```
+
+    """
+    model_args, dataset_args, recipe_args, training_args, _ = parse_args(
+        include_training_args=True, **kwargs
+    )
+
+    pre_process(model_args)
+
+    processed_dataset = get_processed_dataset(
+        dataset_args=dataset_args,
+        processor=model_args.processor,
+    )
+    training_dataset = processed_dataset.get("train")
+
+    trainer = Trainer(
+        model=model_args.model,
+        teacher=model_args.distill_teacher,
+        recipe=recipe_args.recipe,
+        recipe_args=recipe_args.recipe_args,
+        args=training_args,
+        model_args=model_args,
+        dataset_args=dataset_args,
+        train_dataset=training_dataset,
+        processing_class=model_args.processor,
+        data_collator=dataset_args.data_collator,
+    )
+
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+
+    logger.info("*** Train ***")
+    train_result = trainer.train(
+        resume_from_checkpoint=checkpoint,
+    )
+
+    # extend the trainer's metrics with sample count and perplexity, then log and save
+    metrics = train_result.metrics
+    metrics["train_samples"] = len(training_dataset)
+    metrics["perplexity"] = math.exp(metrics["train_loss"])
+    trainer.log_metrics("train", metrics)
+    trainer.save_metrics("train", metrics)
+
+    # this includes saving the state, optimizer and scheduler
+    trainer.save_model(output_dir=training_args.output_dir)
+
+    post_process(model_args=model_args, output_dir=training_args.output_dir)
diff --git a/src/llmcompressor/transformers/finetune/README.md b/src/llmcompressor/transformers/finetune/README.md
index e1312b799..f8af96df3 100644
--- a/src/llmcompressor/transformers/finetune/README.md
+++ b/src/llmcompressor/transformers/finetune/README.md
@@ -1,49 +1,9 @@
 # Sparse Finetuning
 
-## Launching from Console Scripts
-
-### with DataParallel (default)
-
-```bash
-llmcompressor.transformers.text_generation.train
-    --model PATH_TO_MODEL
-    --distill_teacher PATH_TO_TEACHER
-    --dataset DATASET_NAME
-    --recipe PATH_TO_RECIPE
-    --output_dir PATH_TO_OUTPUT
-    --num_train_epochs 1
-    --splits "train"
-```
-
-Also supported:
-
-* `llmcompressor.transformers.text_generation.finetune` (alias for train)
-* `llmcompressor.transformers.text_generation.oneshot`
-* `llmcompressor.transformers.text_generation.eval`
-* `llmcompressor.transformers.text_generation.apply`(for running in sequential stage mode)
-* `llmcompressor.transformers.text_generation.compress` (alias for apply)
-
-### with FSDP
-
-```bash
-accelerate launch 
-    --config_file example_fsdp_config.yaml 
-    --no_python llmcompressor.transformers.text_generation.finetune
-    --model PATH_TO_MODEL
-    --distill_teacher PATH_TO_TEACHER
-    --dataset DATASET_NAME
-    --recipe PATH_TO_RECIPE
-    --output_dir PATH_TO_OUTPUT
-    --num_train_epochs 1
-    --splits "train"
-```
-
-See [configure_fsdp.md](../../../../examples/finetuning/configure_fsdp.md) for additional instructions on setting up FSDP configuration
-
 ## Launching from Python
 
 ```python
-from llmcompressor.transformers import train
+from llmcompressor import train
 
 model = "./obcq_deployment"
 teacher_model = "Xenova/llama2.c-stories15M"
@@ -74,10 +34,10 @@ train(
 
 Finetuning arguments are split up into 3 groups:
 
-* ModelArguments: `src/llmcompressor/transformers/utils/arg_parser/model_arguments.py`
-* TrainingArguments: `src/llmcompressor/transformers/utils/arg_parser/training_arguments.py`
-* DatasetArguments: `src/llmcompressor/transformers/utils/arg_parser/dataset_arguments.py`
-* RecipeArguments: `src/llmcompressor/transformers/utils/arg_parser/recipe_arguments.py`
+* ModelArguments: `src/llmcompressor/args/model_arguments.py`
+* TrainingArguments: `src/llmcompressor/args/training_arguments.py`
+* DatasetArguments: `src/llmcompressor/args/dataset_arguments.py`
+* RecipeArguments: `src/llmcompressor/args/recipe_arguments.py`
 
 
 ## Running Multi-Stage Recipes
@@ -90,9 +50,6 @@ mode.
 See [example_alternating_recipe.yaml](../../../../examples/finetuning/example_alternating_recipe.yaml) for an example 
 of a staged recipe for Llama. 
 
-### Python Example
-(This can also be run with FSDP by launching the script as `accelerate launch --config_file example_fsdp_config.yaml test_multi.py`)
-
 test_multi.py
 ```python
 from llmcompressor.transformers import apply
diff --git a/src/llmcompressor/transformers/finetune/session_mixin.py b/src/llmcompressor/transformers/finetune/session_mixin.py
index f64916e69..20d9ae510 100644
--- a/src/llmcompressor/transformers/finetune/session_mixin.py
+++ b/src/llmcompressor/transformers/finetune/session_mixin.py
@@ -11,13 +11,7 @@
 from transformers.trainer_callback import TrainerState
 from transformers.trainer_utils import get_last_checkpoint
 
-from llmcompressor.core import (
-    active_session,
-    callbacks,
-    create_session,
-    finalize,
-    initialize,
-)
+from llmcompressor.core import active_session, callbacks, create_session
 from llmcompressor.metrics import LoggerManager
 from llmcompressor.modifiers.distillation.utils.pytorch.model_wrapper import (
     KDModelWrapper,
@@ -151,18 +145,20 @@ def initialize_session(
 
         self.accelerator.wait_for_everyone()
         with summon_full_params_context(self.model, offload_to_cpu=True):
-            initialize(
-                model=self.model,
-                teacher_model=self.teacher,  # TODO: what about for self/disable?
+            active_session().initialize(
                 recipe=self.recipe,
                 recipe_stage=stage,
                 recipe_args=self.recipe_args,
+                model=self.model,
+                teacher_model=self.teacher,  # TODO: what about for self/disable?
                 train_data=train_data,
                 start=epoch,
                 copy_data=False,
+                attach_optim_callbacks=True,
                 fsdp_active=self.is_fsdp_enabled,
                 metadata=self.metadata,
             )
+
         self.accelerator.wait_for_everyone()
         model = get_session_model()
         self.model_wrapped = self.model = model
@@ -186,7 +182,7 @@ def finalize_session(self):
 
         with summon_full_params_context(self.model, offload_to_cpu=True):
             # in order to update each layer we need to gathers all its parameters
-            finalize()
+            active_session().finalize()
         logger.info("Finalized LLM Compressor session")
         model = get_session_model()
         self.model = model
@@ -222,7 +218,9 @@ def create_optimizer(self):
                 len(self.train_dataset) / total_batch_size
             )
 
-        initialize(optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch)
+        active_session().initialize(
+            optimizer=self.optimizer, steps_per_epoch=self.total_steps_per_epoch
+        )
 
         return self.optimizer
 
diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py
index 66652b686..1aad787e7 100644
--- a/src/llmcompressor/transformers/finetune/text_generation.py
+++ b/src/llmcompressor/transformers/finetune/text_generation.py
@@ -17,12 +17,11 @@
 # Adapted from https://github.com/huggingface/transformers
 # vllm-project: no copyright
 
-import warnings
+
 from pathlib import PosixPath
 
 from compressed_tensors.utils.helpers import deprecated
 from loguru import logger
-from transformers import HfArgumentParser
 
 from llmcompressor.args import (
     DatasetArguments,
@@ -42,15 +41,6 @@
 from llmcompressor.utils.fsdp.helpers import is_fsdp_model
 
 
-def train(**kwargs):
-    """
-    CLI entrypoint for running training
-    """
-    model_args, dataset_args, recipe_args, training_args = parse_args(**kwargs)
-    training_args.do_train = True
-    main(model_args, dataset_args, recipe_args, training_args)
-
-
 @deprecated(
     message=(
         "`from llmcompressor.transformers import oneshot` is deprecated, "
@@ -63,6 +53,18 @@ def oneshot(**kwargs) -> None:
     oneshot(**kwargs)
 
 
+@deprecated(
+    message=(
+        "`from llmcompressor import train` is deprecated, "
+        "please use `from llmcompressor import train`."
+    )
+)
+def train(**kwargs):
+    from llmcompressor import train
+
+    train(**kwargs)
+
+
 def apply(**kwargs):
     """
     CLI entrypoint for any of training, oneshot
@@ -85,54 +87,6 @@ def compress(**kwargs):
     apply(**kwargs)
 
 
-def parse_args(**kwargs):
-    """
-    Parses kwargs by grouping into model, data or training arg groups:
-        * model_args in
-            src/llmcompressor/transformers/utils/arg_parser/model_args.py
-        * dataset_args in
-            src/llmcompressor/transformers/utils/arg_parser/dataset_args.py
-        * recipe_args in
-            src/llmcompressor/transformers/utils/arg_parser/recipe_args.py
-        * training_args in
-            src/llmcompressor/transformers/utils/arg_parser/training_args.py
-    """
-    parser = HfArgumentParser(
-        (ModelArguments, DatasetArguments, RecipeArguments, TrainingArguments)
-    )
-
-    if not kwargs:
-        parsed_args = parser.parse_args_into_dataclasses()
-    else:
-        parsed_args = parser.parse_dict(kwargs)
-
-    model_args, dataset_args, recipe_args, training_args = parsed_args
-    if recipe_args.recipe_args is not None:
-        if not isinstance(recipe_args.recipe_args, dict):
-            arg_dict = {}
-            for recipe_arg in recipe_args.recipe_args:
-                key, value = recipe_arg.split("=")
-                arg_dict[key] = value
-            recipe_args.recipe_args = arg_dict
-
-    # raise depreciation warnings
-    if dataset_args.remove_columns is not None:
-        warnings.warn(
-            "`remove_columns` argument is depreciated. When tokenizing datasets, all "
-            "columns which are invalid inputs the tokenizer will be removed",
-            DeprecationWarning,
-        )
-
-    # silently assign tokenizer to processor
-    if model_args.tokenizer:
-        if model_args.processor:
-            raise ValueError("Cannot use both a tokenizer and processor")
-        model_args.processor = model_args.tokenizer
-    model_args.tokenizer = None
-
-    return model_args, dataset_args, recipe_args, training_args
-
-
 def main(
     model_args: ModelArguments,
     dataset_args: DatasetArguments,
@@ -263,15 +217,6 @@ def main(
         # exit immediately
         return
 
-    # Training
-    if training_args.do_train:
-        checkpoint = None
-        if training_args.resume_from_checkpoint is not None:
-            checkpoint = training_args.resume_from_checkpoint
-        elif last_checkpoint is not None:
-            checkpoint = last_checkpoint
-        stage_runner.train(checkpoint)
-
     # save if model was provided as a string or custom output_dir was set
     if isinstance(model_args.model, str) or (
         training_args.output_dir
diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py
index 37524069c..2195ae4e6 100644
--- a/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py
+++ b/tests/llmcompressor/transformers/finetune/test_finetune_no_recipe_custom_dataset.py
@@ -18,7 +18,7 @@
 
 class TestFinetuneNoRecipeCustomDataset(unittest.TestCase):
     def _test_finetune_wout_recipe_custom_dataset(self):
-        from llmcompressor.transformers import train
+        from llmcompressor import train
 
         dataset_path = Path(tempfile.mkdtemp())
 
diff --git a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py b/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py
index 7facd088e..42eb495d8 100644
--- a/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py
+++ b/tests/llmcompressor/transformers/finetune/test_finetune_without_recipe.py
@@ -20,7 +20,7 @@ def setUp(self):
         self.output = "./finetune_output"
 
     def test_finetune_without_recipe(self):
-        from llmcompressor.transformers import train
+        from llmcompressor import train
 
         recipe_str = None
         device = "cuda:0"
diff --git a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
index e8e0ae426..ec68e1f5d 100644
--- a/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
+++ b/tests/llmcompressor/transformers/finetune/test_oneshot_then_finetune.py
@@ -6,10 +6,9 @@
 from transformers import AutoModelForCausalLM
 from transformers.utils.quantization_config import CompressedTensorsConfig
 
-from llmcompressor import oneshot
+from llmcompressor import oneshot, train
 from llmcompressor.core import create_session
 from llmcompressor.modifiers.quantization import QuantizationModifier
-from llmcompressor.transformers import train
 
 
 @pytest.mark.unit
diff --git a/tests/llmcompressor/transformers/finetune/test_safetensors.py b/tests/llmcompressor/transformers/finetune/test_safetensors.py
index 84c1bf1b2..462c529e6 100644
--- a/tests/llmcompressor/transformers/finetune/test_safetensors.py
+++ b/tests/llmcompressor/transformers/finetune/test_safetensors.py
@@ -22,7 +22,7 @@ def setUp(self):
         self.output = Path("./finetune_output")
 
     def test_safetensors(self):
-        from llmcompressor.transformers import train
+        from llmcompressor import train
 
         device = "cuda:0"
         output_dir = self.output / "output1"
diff --git a/tests/llmcompressor/transformers/test_clear_ml.py b/tests/llmcompressor/transformers/test_clear_ml.py
index 4a7922a66..94abd1a62 100644
--- a/tests/llmcompressor/transformers/test_clear_ml.py
+++ b/tests/llmcompressor/transformers/test_clear_ml.py
@@ -10,7 +10,7 @@
 except Exception:
     is_clearml = False
 
-from llmcompressor.transformers import train
+from llmcompressor import train
 
 
 @pytest.mark.skipif(not is_clearml, reason="clearML not installed")