From 6ec5767a48903419a0413fec34183dc4b1c9dc52 Mon Sep 17 00:00:00 2001
From: Hitesh Shah
Date: Fri, 13 Sep 2024 20:07:53 +0000
Subject: [PATCH] Quantize: CLI command to quantize the input model

Usage: olive quantize -m <model_name_or_path> --device <device> --algorithms <algorithms> --data_config_path <data_config_path> -o <output_path>

A few other code improvements:

* Moved the global functions in cli/base.py to be static members of BaseOliveCLICommand (olive/cli/base.py) to avoid multiple imports in each CLI command implementation. Moreover, these functions are only usable in the context of a CLI command implementation anyway.
* Created new helpers (_add_dataconfig_options, _add_hf_dataset_options, and _add_accelerator_options) on BaseOliveCLICommand to avoid code duplication and standardize argument handling across the different CLI command implementations.
---
 olive/cli/base.py               | 475 +++++++++++++++++++++++---------
 olive/cli/capture_onnx.py       |  30 +-
 olive/cli/export_adapters.py    |   2 +-
 olive/cli/finetune.py           |  88 ++----
 olive/cli/launcher.py           |   2 +
 olive/cli/manage_aml_compute.py |  18 +-
 olive/cli/perf_tuning.py        | 123 ++-------
 olive/cli/quantize.py           | 153 ++++++++++
 olive/common/hf/mappings.py     |  13 +
 olive/passes/pytorch/gptq.py    |  31 ++-
 test/unit_test/cli/test_cli.py  |  42 +++
 11 files changed, 632 insertions(+), 345 deletions(-)
 create mode 100644 olive/cli/quantize.py

diff --git a/olive/cli/base.py b/olive/cli/base.py
index e697fff3e..37e63f2c8 100644
--- a/olive/cli/base.py
+++ b/olive/cli/base.py
@@ -16,7 +16,7 @@
 import yaml
 
 from olive.cli.constants import CONDA_CONFIG
-from olive.common.utils import hash_dict
+from olive.common.utils import hash_dict, set_nested_dict_value, unescaped_str
 
 
 class BaseOliveCLICommand(ABC):
@@ -38,132 +38,353 @@ def register_subcommand(parser: ArgumentParser):
     def run(self):
         raise NotImplementedError
 
+    @staticmethod
+    def _get_model_name_or_path(model_name_or_path) -> Union[str, Dict[str, str]]:
+        pattern = r"^(?P<registry_name>[^:]+):(?P<model_name>[^:]+):(?P<version>[^:]+)$"
+        match = re.match(pattern, model_name_or_path)
+
+        if match:
+            return {
+                "type": "azureml_registry_model",
+                "registry_name": match.group("registry_name"),
+                "name": match.group("model_name"),
+                "version": match.group("version"),
+            }
+
+        pattern = r"https://huggingface\.co/([^/]+/[^/]+)(?:/.*)?"
+        match = re.search(pattern, model_name_or_path)
+
+        if match:
+            return match.group(1)
+
+        return model_name_or_path
 
-def get_model_name_or_path(model_name_or_path) -> Union[str, Dict[str, str]]:
-    pattern = r"^(?P<registry_name>[^:]+):(?P<model_name>[^:]+):(?P<version>[^:]+)$"
-    match = re.match(pattern, model_name_or_path)
-
-    if match:
-        return {
-            "type": "azureml_registry_model",
-            "registry_name": match.group("registry_name"),
-            "name": match.group("model_name"),
-            "version": match.group("version"),
-        }
-
-    pattern = r"https://huggingface\.co/([^/]+/[^/]+)(?:/.*)?"
-    match = re.search(pattern, model_name_or_path)
-
-    if match:
-        return match.group(1)
-
-    return model_name_or_path
-
-
-def add_logging_options(sub_parser):
-    log_group = sub_parser.add_argument_group("logging options")
-    log_group.add_argument(
-        "--log_level",
-        type=int,
-        default=3,
-        help="Logging level. Default is 3. 
level 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR, 4: CRITICAL", - ) - - -def add_remote_options(sub_parser): - remote_group = sub_parser.add_argument_group("remote options") - remote_group.add_argument( - "--resource_group", - type=str, - required=False, - help="Resource group for the AzureML workspace.", - ) - remote_group.add_argument( - "--workspace_name", - type=str, - required=False, - help="Workspace name for the AzureML workspace.", - ) - remote_group.add_argument( - "--keyvault_name", - type=str, - required=False, - help=( - "The azureml keyvault name with huggingface token to use for remote run. Refer to" - " https://microsoft.github.io/Olive/features/huggingface_model_optimization.html#huggingface-login for" - " more details." - ), - ) - remote_group.add_argument( - "--aml_compute", - type=str, - required=False, - help="The compute name to run the workflow on.", - ) - - -def add_hf_model_options(sub_parser): - model_group = sub_parser.add_argument_group("model options") - model_group.add_argument( - "-m", - "--model_name_or_path", - type=str, - required=True, - help=( - "The model checkpoint for weights initialization. If using an AzureML Registry model, provide the model" - " path as 'registry_name:model_name:version'." - ), - ) - model_group.add_argument("--trust_remote_code", action="store_true", help="Trust remote code when loading a model.") - model_group.add_argument("-t", "--task", type=str, help="Task for which the model is used.") - - -def is_remote_run(args): - return all([args.resource_group, args.workspace_name, args.aml_compute]) - - -def update_remote_option(config, args, cli_action, tempdir): - if args.resource_group or args.workspace_name or args.aml_compute: - if not is_remote_run(args): - raise ValueError("resource_group, workspace_name and aml_compute are required for remote workflow run.") - - config["workflow_id"] = f"{cli_action}-{hash_dict(config)}" - - try: - subscription_id = json.loads(subprocess.check_output("az account show", shell=True).decode("utf-8"))["id"] - print("Using Azure subscription ID: %s", subscription_id) - - except subprocess.CalledProcessError: - print( - "Error: Unable to retrieve account information. " - "Make sure you are logged in to Azure CLI with command `az login`." + @staticmethod + def _add_logging_options(sub_parser): + log_group = sub_parser.add_argument_group("logging options") + log_group.add_argument( + "--log_level", + type=int, + default=3, + help="Logging level. Default is 3. level 0: DEBUG, 1: INFO, 2: WARNING, 3: ERROR, 4: CRITICAL", + ) + return log_group + + @staticmethod + def _add_remote_options(sub_parser): + remote_group = sub_parser.add_argument_group("remote options") + remote_group.add_argument( + "--resource_group", + type=str, + required=False, + help="Resource group for the AzureML workspace.", + ) + remote_group.add_argument( + "--workspace_name", + type=str, + required=False, + help="Workspace name for the AzureML workspace.", + ) + remote_group.add_argument( + "--keyvault_name", + type=str, + required=False, + help=( + "The azureml keyvault name with huggingface token to use for remote run. Refer to" + " https://microsoft.github.io/Olive/features/huggingface_model_optimization.html#huggingface-login for" + " more details." 
+ ), + ) + remote_group.add_argument( + "--aml_compute", + type=str, + required=False, + help="The compute name to run the workflow on.", + ) + + return remote_group + + @staticmethod + def _add_hf_model_options(sub_parser): + model_group = sub_parser.add_argument_group("model options") + model_group.add_argument( + "-m", + "--model_name_or_path", + type=str, + required=True, + help=( + "The model checkpoint for weights initialization. If using an AzureML Registry model, provide the model" + " path as 'registry_name:model_name:version'." + ), + ) + model_group.add_argument( + "--trust_remote_code", action="store_true", help="Trust remote code when loading a model." + ) + model_group.add_argument("-t", "--task", type=str, help="Task for which the model is used.") + + return model_group + + @staticmethod + def _add_dataconfig_options(sub_parser): + dataconfig_group = sub_parser.add_argument_group( + "data config options, which mutually exclusive with huggingface dataset options" + ) + dataconfig_group.add_argument( + "--data_config_path", + type=str, + help="Path to the data config file. It allows to customize the data config(json/yaml) for the model.", + ) + + return dataconfig_group + + @staticmethod + def _add_dataset_options(sub_parser): + dataset_group = sub_parser.add_argument_group("dataset options") + dataset_group.add_argument( + "-d", + "--data_name", + type=str, + required=True, + help="The dataset name.", + ) + dataset_group.add_argument("--train_subset", type=str, help="The subset to use for training.") + dataset_group.add_argument("--eval_subset", type=str, help="The subset to use for evaluation.") + # TODO(jambayk): currently only supports single file or list of files, support mapping + dataset_group.add_argument( + "--data_files", type=str, help="The dataset files. If multiple files, separate by comma." + ) + dataset_group.add_argument("--train_split", type=str, default="train", help="The split to use for training.") + dataset_group.add_argument( + "--eval_split", + default="", + help="The dataset split to evaluate on.", + ) + text_group = dataset_group.add_mutually_exclusive_group(required=False) + text_group.add_argument( + "--text_field", + type=str, + help="The text field to use for fine-tuning.", + ) + text_group.add_argument( + "--text_template", + # using special string type to allow for escaped characters like \n + type=unescaped_str, + help=r"Template to generate text field from. E.g. 
'### Question: {prompt} \n### Answer: {response}'", + ) + dataset_group.add_argument( + "--max_seq_len", + type=int, + default=1024, + help="Maximum sequence length for the data.", + ) + dataset_group.add_argument( + "--add_special_tokens", + type=bool, + default=False, + help="Whether to add special tokens during preprocessing.", + ) + dataset_group.add_argument( + "--max_samples", + type=int, + default=256, + help="Maximum samples to select from the dataset.", + ) + dataset_group.add_argument( + "--batch_size", + type=int, + default=1, + help="Batch size.", + ) + + return dataset_group, text_group + + def _update_dataset_options(self, config): + load_key = ("data_configs", 0, "load_dataset_config") + preprocess_key = ("data_configs", 0, "pre_process_data_config") + dataloader_key = ("data_configs", 0, "dataloader_config") + to_replace = [ + ((*load_key, "data_name"), self.args.data_name), + ((*load_key, "split"), self.args.train_split), + ((*load_key, "subset"), self.args.train_subset), + ( + (*load_key, "data_files"), + self.args.data_files.split(",") if self.args.data_files else None, + ), + ((*preprocess_key, "text_cols"), self.args.text_field), + ((*preprocess_key, "text_template"), self.args.text_template), + ((*preprocess_key, "max_seq_len"), self.args.max_seq_len), + ((*preprocess_key, "add_special_tokens"), self.args.add_special_tokens), + ((*preprocess_key, "max_samples"), self.args.max_samples), + ((*dataloader_key, "batch_size"), self.args.batch_size), + ] + for keys, value in to_replace: + if value is not None: + set_nested_dict_value(config, keys, value) + + @staticmethod + def _add_hf_dataset_options(sub_parser): + hf_dataset_group = sub_parser.add_argument_group( + "huggingface dataset options, if dataset options are not provided, " + "user should provide the following options to modify the default data config. " + "Please refer to olive.data.container.TransformersTokenDummyDataContainer for more details." 
+ ) + hf_dataset_group.add_argument( + "--hf_model_name", + help="Huggingface model name used to load model configs from huggingface.", + ) + hf_dataset_group.add_argument( + "--batch_size", + type=int, + help="Batch size of the input data.", + ) + hf_dataset_group.add_argument( + "--seq_len", + type=int, + help="Sequence length to use for the input data.", + ) + hf_dataset_group.add_argument( + "--past_seq_len", + type=int, + help="Past sequence length to use for the input data.", + ) + hf_dataset_group.add_argument( + "--max_seq_len", + type=int, + help="Max sequence length to use for the input data.", + ) + hf_dataset_group.add_argument( + "--shared_kv", + action="store_true", + help="Whether to enable share kv cache in the input data.", + ) + hf_dataset_group.add_argument( + "--generative", + action="store_true", + help="Whether to enable generative mode in the input data.", + ) + hf_dataset_group.add_argument( + "--ort_past_key_name", + type=str, + help="Past key name for the input data.", + ) + hf_dataset_group.add_argument( + "--ort_past_value_name", + type=str, + help="Past value name for the input data.", + ) + hf_dataset_group.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether to trust remote code in the input data.", + ) + hf_dataset_group.add_argument( + "--max_samples", + type=int, + help="Max samples to use for the input data.", + ) + hf_dataset_group.add_argument( + "--fields_no_batch", + nargs="*", + help="List of fields that should not be batched.", + ) + + return hf_dataset_group + + @staticmethod + def _add_accelerator_options(sub_parser): + accelerator_group = sub_parser.add_argument_group("accelerator group") + + accelerator_group.add_argument( + "--device", + type=str, + default="cpu", + choices=["gpu", "cpu", "npu"], + help="Device to use for the model.", + ) + + accelerator_group.add_argument( + "--providers_list", + type=str, + nargs="*", + choices=[ + "CUDAExecutionProvider", + "DmlExecutionProvider", + "JsExecutionProvider", + "MIGraphXExecutionProvider", + "OpenVINOExecutionProvider", + "OpenVINOExecutionProvider", + "QNNExecutionProviderROCMExecutionProvider", + "TensorrtExecutionProvider", + ], + help=( + "List of execution providers to use for ONNX model. They are case sensitive. " + "If not provided, all available providers will be used." 
+ ), + ) + + return accelerator_group + + def _update_accelerator_options(self, config): + to_replace = [ + (("systems", "local_system", "accelerators", 0, "device"), self.args.device), + ] + + if self.args.providers_list: + to_replace.append( + (("systems", "local_system", "accelerators", 0, "execution_providers"), self.args.providers_list) ) - config["azureml_client"] = { - "subscription_id": subscription_id, - "resource_group": args.resource_group, - "workspace_name": args.workspace_name, - "keyvault_name": args.keyvault_name, - "default_auth_params": {"exclude_managed_identity_credential": True}, - } - - conda_file_path = Path(tempdir) / "conda_gpu.yaml" - with open(conda_file_path, "w") as f: - yaml.dump(CONDA_CONFIG, f) - - config["systems"]["aml_system"] = { - "type": "AzureML", - "accelerators": [{"device": "GPU", "execution_providers": ["CUDAExecutionProvider"]}], - "aml_compute": args.aml_compute, - "aml_docker_config": { - "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04", - "conda_file_path": str(conda_file_path), - }, - "hf_token": bool(args.keyvault_name), - } - config["workflow_host"] = "aml_system" - - -# TODO(team): Remove this function once the output structure is refactored -def get_output_model_number(outputs: Dict) -> int: - return sum(len(f.nodes) for f in outputs.values()) + for k, v in to_replace: + if v is not None: + set_nested_dict_value(config, k, v) + + def _is_remote_run(self): + return all([self.args.resource_group, self.args.workspace_name, self.args.aml_compute]) + + def _update_remote_option(self, config, cli_action, tempdir): + if self.args.resource_group or self.args.workspace_name or self.args.aml_compute: + if not self._is_remote_run(): + raise ValueError("resource_group, workspace_name and aml_compute are required for remote workflow run.") + + config["workflow_id"] = f"{cli_action}-{hash_dict(config)}" + + try: + subscription_id = json.loads(subprocess.check_output("az account show", shell=True).decode("utf-8"))[ + "id" + ] + print(f"Using Azure subscription ID: {subscription_id}") + + except subprocess.CalledProcessError: + print( + "Error: Unable to retrieve account information. " + "Make sure you are logged in to Azure CLI with command `az login`." 
+ ) + + config["azureml_client"] = { + "subscription_id": subscription_id, + "resource_group": self.args.resource_group, + "workspace_name": self.args.workspace_name, + "keyvault_name": self.args.keyvault_name, + "default_auth_params": {"exclude_managed_identity_credential": True}, + } + + conda_file_path = Path(tempdir) / "conda_gpu.yaml" + with open(conda_file_path, "w") as f: + yaml.dump(CONDA_CONFIG, f) + + config["systems"]["aml_system"] = { + "type": "AzureML", + "accelerators": [{"device": "GPU", "execution_providers": ["CUDAExecutionProvider"]}], + "aml_compute": self.args.aml_compute, + "aml_docker_config": { + "base_image": "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04", + "conda_file_path": str(conda_file_path), + }, + "hf_token": bool(self.args.keyvault_name), + } + config["workflow_host"] = "aml_system" + + # TODO(team): Remove this function once the output structure is refactored + @staticmethod + def _get_output_model_number(outputs: Dict) -> int: + return sum(len(f.nodes) for f in outputs.values()) diff --git a/olive/cli/capture_onnx.py b/olive/cli/capture_onnx.py index 1454abb5d..e6b209c59 100644 --- a/olive/cli/capture_onnx.py +++ b/olive/cli/capture_onnx.py @@ -11,16 +11,7 @@ from pathlib import Path from typing import ClassVar, Dict -from olive.cli.base import ( - BaseOliveCLICommand, - add_hf_model_options, - add_logging_options, - add_remote_options, - get_model_name_or_path, - get_output_model_number, - is_remote_run, - update_remote_option, -) +from olive.cli.base import BaseOliveCLICommand from olive.common.utils import IntEnumBase, hardlink_copy_dir, set_nested_dict_value, set_tempdir @@ -41,10 +32,10 @@ def register_subcommand(parser: ArgumentParser): help=("Capture ONNX graph using PyTorch Exporter or Model Builder from the Huggingface model."), ) - add_logging_options(sub_parser) + CaptureOnnxGraphCommand._add_logging_options(sub_parser) # model options - add_hf_model_options(sub_parser) + CaptureOnnxGraphCommand._add_hf_model_options(sub_parser) sub_parser.add_argument( "--device", @@ -153,7 +144,7 @@ def register_subcommand(parser: ArgumentParser): ) # remote options - add_remote_options(sub_parser) + CaptureOnnxGraphCommand._add_remote_options(sub_parser) sub_parser.set_defaults(func=CaptureOnnxGraphCommand) @@ -167,18 +158,18 @@ def run(self): output = olive_run(run_config) - if is_remote_run(self.args): + if self._is_remote_run(): # TODO(jambayk): point user to datastore with outputs or download outputs # both are not implemented yet return - if get_output_model_number(output) > 0: + if CaptureOnnxGraphCommand._get_output_model_number(output) > 0: output_path = Path(self.args.output_path) output_path.mkdir(parents=True, exist_ok=True) pass_name = "m" if self.args.use_model_builder else "c" device_name = "gpu-cuda_model" if self.args.device == "gpu" else "cpu-cpu_model" hardlink_copy_dir(Path(tempdir) / pass_name / device_name, output_path) - print("ONNX Model is saved to %s", output_path.resolve()) + print(f"ONNX Model is saved to {output_path.resolve()}") else: print("Failed to run capture-onnx-graph. 
Please set the log_level to 1 for more detailed logs.") @@ -191,7 +182,10 @@ def get_run_config(self, tempdir: str) -> Dict: to_replace = [ ("output_dir", tempdir), ("log_severity_level", self.args.log_level), - (("input_model", "model_path"), get_model_name_or_path(self.args.model_name_or_path)), + ( + ("input_model", "model_path"), + CaptureOnnxGraphCommand._get_model_name_or_path(self.args.model_name_or_path), + ), (("input_model", "load_kwargs", "trust_remote_code"), self.args.trust_remote_code), (("systems", "local_system", "accelerators", 0, "device"), self.args.device), ( @@ -231,7 +225,7 @@ def get_run_config(self, tempdir: str) -> Dict: if value is None: continue set_nested_dict_value(config, keys, value) - update_remote_option(config, self.args, "capture-onnx-graph", tempdir) + self._update_remote_option(config, "capture-onnx-graph", tempdir) return config diff --git a/olive/cli/export_adapters.py b/olive/cli/export_adapters.py index dd7241f7e..2673c4ea4 100644 --- a/olive/cli/export_adapters.py +++ b/olive/cli/export_adapters.py @@ -136,7 +136,7 @@ def run(self): ) output_path = save_weights(transformed_weights, self.args.output_path, self.args.save_format) - print("Exported adapter weights to %s", output_path) + print(f"Exported adapter weights to {output_path}") return output_path diff --git a/olive/cli/finetune.py b/olive/cli/finetune.py index a9506387e..e88968199 100644 --- a/olive/cli/finetune.py +++ b/olive/cli/finetune.py @@ -11,17 +11,8 @@ from pathlib import Path from typing import ClassVar, Dict -from olive.cli.base import ( - BaseOliveCLICommand, - add_hf_model_options, - add_logging_options, - add_remote_options, - get_model_name_or_path, - get_output_model_number, - is_remote_run, - update_remote_option, -) -from olive.common.utils import hardlink_copy_dir, set_nested_dict_value, set_tempdir, unescaped_str +from olive.cli.base import BaseOliveCLICommand +from olive.common.utils import hardlink_copy_dir, set_nested_dict_value, set_tempdir class FineTuneCommand(BaseOliveCLICommand): @@ -37,7 +28,7 @@ def register_subcommand(parser: ArgumentParser): ), ) - add_logging_options(sub_parser) + FineTuneCommand._add_logging_options(sub_parser) # TODO(jambayk): option to list/install required dependencies? sub_parser.add_argument( @@ -49,7 +40,7 @@ def register_subcommand(parser: ArgumentParser): ) # Model options - add_hf_model_options(sub_parser) + FineTuneCommand._add_hf_model_options(sub_parser) sub_parser.add_argument( "--torch_dtype", @@ -63,42 +54,8 @@ def register_subcommand(parser: ArgumentParser): ) # Dataset options - dataset_group = sub_parser.add_argument_group("dataset options") - dataset_group.add_argument( - "-d", - "--data_name", - type=str, - required=True, - help="The dataset name.", - ) - # TODO(jambayk): currently only supports single file or list of files, support mapping - dataset_group.add_argument( - "--data_files", type=str, help="The dataset files. If multiple files, separate by comma." 
- ) - dataset_group.add_argument("--train_split", type=str, default="train", help="The split to use for training.") - dataset_group.add_argument( - "--eval_split", - default="", - help="The dataset split to evaluate on.", - ) - text_group = dataset_group.add_mutually_exclusive_group(required=True) - text_group.add_argument( - "--text_field", - type=str, - help="The text field to use for fine-tuning.", - ) - text_group.add_argument( - "--text_template", - # using special string type to allow for escaped characters like \n - type=unescaped_str, - help=r"Template to generate text field from. E.g. '### Question: {prompt} \n### Answer: {response}'", - ) - dataset_group.add_argument( - "--max_seq_len", - type=int, - default=1024, - help="Maximum sequence length for the data.", - ) + FineTuneCommand._add_dataset_options(sub_parser) + # LoRA options lora_group = sub_parser.add_argument_group("lora options") lora_group.add_argument( @@ -134,7 +91,7 @@ def register_subcommand(parser: ArgumentParser): sub_parser.add_argument("--clean", action="store_true", help="Run in a clean cache directory") # remote options - add_remote_options(sub_parser) + FineTuneCommand._add_remote_options(sub_parser) sub_parser.set_defaults(func=FineTuneCommand) @@ -148,17 +105,17 @@ def run(self): output = olive_run(run_config) - if is_remote_run(self.args): + if self._is_remote_run(): # TODO(jambayk): point user to datastore with outputs or download outputs # both are not implemented yet return - if get_output_model_number(output) > 0: + if FineTuneCommand._get_output_model_number(output) > 0: # need to improve the output structure of olive run output_path = Path(self.args.output_path) output_path.mkdir(parents=True, exist_ok=True) hardlink_copy_dir(Path(tempdir) / "-".join(run_config["passes"].keys()) / "gpu-cuda_model", output_path) - print("Model and adapters saved to %s", output_path.resolve()) + print(f"Model and adapters saved to {output_path.resolve()}") else: print("Failed to run finetune. 
Please set the log_level to 1 for more detailed logs.") @@ -178,21 +135,11 @@ def parse_training_args(self) -> Dict: return {k: v for k, v in vars(training_args).items() if k in arg_keys} def get_run_config(self, tempdir: str) -> Dict: - load_key = ("data_configs", 0, "load_dataset_config") - preprocess_key = ("data_configs", 0, "pre_process_data_config") + finetune_key = ("passes", "f") - model_path = get_model_name_or_path(self.args.model_name_or_path) + model_path = FineTuneCommand._get_model_name_or_path(self.args.model_name_or_path) to_replace = [ (("input_model", "model_path"), model_path), - ((*load_key, "data_name"), self.args.data_name), - ((*load_key, "split"), self.args.train_split), - ( - (*load_key, "data_files"), - self.args.data_files.split(",") if self.args.data_files else None, - ), - ((*preprocess_key, "text_cols"), self.args.text_field), - ((*preprocess_key, "text_template"), self.args.text_template), - ((*preprocess_key, "max_seq_len"), self.args.max_seq_len), ((*finetune_key, "type"), self.args.method), ((*finetune_key, "torch_dtype"), self.args.torch_dtype), ((*finetune_key, "training_args"), self.parse_training_args()), @@ -210,10 +157,11 @@ def get_run_config(self, tempdir: str) -> Dict: to_replace.append(((*finetune_key, "target_modules"), self.args.target_modules.split(","))) config = deepcopy(TEMPLATE) + self._update_dataset_options(config) + for keys, value in to_replace: - if value is None: - continue - set_nested_dict_value(config, keys, value) + if value is not None: + set_nested_dict_value(config, keys, value) if self.args.eval_split: eval_data_config = deepcopy(config["data_configs"][0]) @@ -225,7 +173,7 @@ def get_run_config(self, tempdir: str) -> Dict: if not self.args.use_ort_genai: del config["passes"]["m"] - update_remote_option(config, self.args, "finetune", tempdir) + self._update_remote_option(config, "finetune", tempdir) config["log_severity_level"] = self.args.log_level return config @@ -247,6 +195,8 @@ def get_run_config(self, tempdir: str) -> Dict: "type": "HuggingfaceContainer", "load_dataset_config": {}, "pre_process_data_config": {}, + "dataloader_config": {}, + "post_process_data_config": {}, } ], "passes": { diff --git a/olive/cli/launcher.py b/olive/cli/launcher.py index e758f9dfc..f34c14ad3 100644 --- a/olive/cli/launcher.py +++ b/olive/cli/launcher.py @@ -13,6 +13,7 @@ from olive.cli.finetune import FineTuneCommand from olive.cli.manage_aml_compute import ManageAMLComputeCommand from olive.cli.perf_tuning import PerfTuningCommand +from olive.cli.quantize import QuantizeCommand from olive.cli.run import WorkflowRunCommand @@ -33,6 +34,7 @@ def get_cli_parser(called_as_console_script: bool = True) -> ArgumentParser: ConfigureQualcommSDKCommand.register_subcommand(commands_parser) ManageAMLComputeCommand.register_subcommand(commands_parser) PerfTuningCommand.register_subcommand(commands_parser) + QuantizeCommand.register_subcommand(commands_parser) CloudCacheCommand.register_subcommand(commands_parser) return parser diff --git a/olive/cli/manage_aml_compute.py b/olive/cli/manage_aml_compute.py index f054b07ce..832bc8947 100644 --- a/olive/cli/manage_aml_compute.py +++ b/olive/cli/manage_aml_compute.py @@ -68,7 +68,7 @@ def run(self): ) if self.args.create: - print("Creating compute %s...", self.args.compute_name) + print(f"Creating compute {self.args.compute_name}...") if self.args.vm_size is None: raise ValueError("vm_size must be provided if operation is create") if self.args.location is None: @@ -84,19 +84,15 @@ def run(self): ) 
ml_client.begin_create_or_update(cluster_basic).result() print( - "Successfully created compute: %s at %s with vm_size:%s and " - "min_nodes=%d and max_nodes=%d and idle_time_before_scale_down=%d", - self.args.compute_name, - self.args.location, - self.args.vm_size, - self.args.min_nodes, - self.args.max_nodes, - self.args.idle_time_before_scale_down, + f"Successfully created compute: {self.args.compute_name} at {self.args.location} " + f"with vm_size:{self.args.vm_size} and min_nodes={self.args.min_nodes} and " + f"max_nodes={self.args.max_nodes} and " + f"idle_time_before_scale_down={self.args.idle_time_before_scale_down}" ) elif self.args.delete: - print("Deleting compute %s...", self.args.compute_name) + print(f"Deleting compute {self.args.compute_name}...") ml_client.compute.begin_delete(self.args.compute_name).wait() - print("Successfully deleted compute: %s", self.args.compute_name) + print(f"Successfully deleted compute: {self.args.compute_name}") @classmethod def get_ml_client(cls, aml_config_path: str, subscription_id: str, resource_group: str, workspace_name: str): diff --git a/olive/cli/perf_tuning.py b/olive/cli/perf_tuning.py index aa499dba5..7516a3e8b 100644 --- a/olive/cli/perf_tuning.py +++ b/olive/cli/perf_tuning.py @@ -15,14 +15,7 @@ import yaml from olive.auto_optimizer.template_mapping import PERF_TUNING_TEMPLATE -from olive.cli.base import ( - BaseOliveCLICommand, - add_logging_options, - add_remote_options, - get_output_model_number, - is_remote_run, - update_remote_option, -) +from olive.cli.base import BaseOliveCLICommand from olive.common.utils import set_nested_dict_value, set_tempdir from olive.data.config import DataConfig from olive.workflows import run as olive_run @@ -41,7 +34,7 @@ def register_subcommand(parser: ArgumentParser): "--hf_model_name hf_model_name --device device_type to get the tuned session parameters." ), ) - add_logging_options(sub_parser) + PerfTuningCommand._add_logging_options(sub_parser) # model options model_group = sub_parser.add_argument_group("model options") @@ -52,95 +45,20 @@ def register_subcommand(parser: ArgumentParser): ) # dataset options - dataset_group = sub_parser.add_argument_group( - "dataset options, which mutually exclusive with huggingface dataset options" - ) - dataset_group.add_argument( - "--data_config_path", - type=str, - help="Path to the data config file. It allows to customize the data config(json/yaml) for the model.", - ) - - hf_dataset_group = sub_parser.add_argument_group( - "huggingface dataset options, if dataset options are not provided, " - "user should provide the following options to modify the default data config. " - "Please refer to olive.data.container.TransformersTokenDummyDataContainer for more details." 
- ) + PerfTuningCommand._add_dataconfig_options(sub_parser) + hf_dataset_group = PerfTuningCommand._add_hf_dataset_options(sub_parser) hf_dataset_group.add_argument( "--predict_with_kv_cache", action="store_true", help="Whether to use key-value cache for perf_tuning", ) - hf_dataset_group.add_argument( - "--hf_model_name", - help="Huggingface model name used to load model configs from huggingface.", - ) - hf_dataset_group.add_argument( - "--batch_size", - type=int, - help="Batch size of the input data.", - ) - hf_dataset_group.add_argument( - "--seq_len", - type=int, - help="Sequence length to use for the input data.", - ) - hf_dataset_group.add_argument( - "--past_seq_len", - type=int, - help="Past sequence length to use for the input data.", - ) - hf_dataset_group.add_argument( - "--max_seq_len", - type=int, - help="Max sequence length to use for the input data.", - ) - hf_dataset_group.add_argument( - "--shared_kv", - action="store_true", - help="Whether to enable share kv cache in the input data.", - ) - hf_dataset_group.add_argument( - "--generative", - action="store_true", - help="Whether to enable generative mode in the input data.", - ) - hf_dataset_group.add_argument( - "--ort_past_key_name", - type=str, - help="Past key name for the input data.", - ) - hf_dataset_group.add_argument( - "--ort_past_value_name", - type=str, - help="Past value name for the input data.", - ) - hf_dataset_group.add_argument( - "--trust_remote_code", - action="store_true", - help="Whether to trust remote code in the input data.", - ) - hf_dataset_group.add_argument( - "--max_samples", - type=int, - help="Max samples to use for the input data.", - ) - hf_dataset_group.add_argument( - "--fields_no_batch", - nargs="*", - help="List of fields that should not be batched.", - ) # pass options pass_group = sub_parser.add_argument_group("pass options") - pass_group.add_argument( - "--device", - type=str, - default="cpu", - choices=["gpu", "cpu"], - help="Device to use for the model.", - ) + # accelerator options + PerfTuningCommand._add_accelerator_options(sub_parser) + pass_group.add_argument( "--cpu_cores", type=int, @@ -157,15 +75,6 @@ def register_subcommand(parser: ArgumentParser): action="store_true", help="Whether enable CUDA Graph for CUDA execution provider.", ) - pass_group.add_argument( - "--providers_list", - type=str, - nargs="*", - help=( - "List of execution providers to use for ONNX model. They are case sensitive. " - "If not provided, all available providers will be used." - ), - ) pass_group.add_argument( "--execution_mode_list", type=int, nargs="*", help="Parallelism list between operators." 
) @@ -211,7 +120,7 @@ def register_subcommand(parser: ArgumentParser): ) # remote options - add_remote_options(sub_parser) + PerfTuningCommand._add_remote_options(sub_parser) sub_parser.set_defaults(func=PerfTuningCommand) @@ -302,12 +211,12 @@ def get_run_config(self, tempdir) -> Dict: to_replace.append((system_ep_key, self.args.providers_list)) config = deepcopy(template_config) - for k, v in to_replace: - if v is None: - continue - set_nested_dict_value(config, k, v) + self._update_accelerator_options(config) + self._update_remote_option(config, "perf-tuning", tempdir) - update_remote_option(config, self.args, "perf-tuning", tempdir) + for k, v in to_replace: + if v is not None: + set_nested_dict_value(config, k, v) config["log_severity_level"] = self.args.log_level return config @@ -320,12 +229,12 @@ def run(self): run_config["output_dir"] = tempdir output = olive_run(run_config) - if is_remote_run(self.args): + if self._is_remote_run(): # TODO(jambayk): point user to datastore with outputs or download outputs # both are not implemented yet return - if get_output_model_number(output) > 0: + if PerfTuningCommand._get_output_model_number(output) > 0: # need to improve the output structure of olive run output_path = Path(self.args.output_path) output_path.mkdir(parents=True, exist_ok=True) @@ -336,6 +245,6 @@ def run(self): with rls_json_path.open() as f: infer_settings = json.load(f)["config"]["inference_settings"] json.dump(infer_settings, infer_setting_output_path.open("w"), indent=4) - print("Inference session parameters are saved to %s", output_path.resolve()) + print(f"Inference session parameters are saved to {output_path.resolve()}") else: print("Failed to run tune-session-params. Please set the log_level to 1 for more detailed logs.") diff --git a/olive/cli/quantize.py b/olive/cli/quantize.py new file mode 100644 index 000000000..28b9ebdf8 --- /dev/null +++ b/olive/cli/quantize.py @@ -0,0 +1,153 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +# ruff: noqa: T201 +# ruff: noqa: RUF012 + +import tempfile +from argparse import ArgumentParser +from copy import deepcopy +from pathlib import Path +from typing import Any, Dict + +from olive.cli.base import BaseOliveCLICommand +from olive.common.utils import hardlink_copy_dir, set_nested_dict_value, set_tempdir + + +class QuantizeCommand(BaseOliveCLICommand): + _CONFIG_TEMPLATE = { + "input_model": {"type": "HfModel", "load_kwargs": {"attn_implementation": "eager"}}, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [{"device": "gpu", "execution_providers": ["CUDAExecutionProvider"]}], + } + }, + "data_configs": [ + { + "name": "default_data_config", + "type": "HuggingfaceContainer", + "load_dataset_config": {}, + "pre_process_data_config": {}, + "dataloader_config": {}, + "post_process_data_config": {}, + } + ], + "passes": { + "awq": {"type": "AutoAWQQuantizer"}, + "gptq": { + # Ref: https://github.com/AutoGPTQ/AutoGPTQ/pull/651/files + "type": "GptqQuantizer", + "data_config": "default_data_config", + }, + "quarot": { + "type": "QuaRot", + "w_rtn": True, + "rotate": True, + "w_bits": 4, + "a_bits": 4, + "k_bits": 4, + "v_bits": 4, + "calibration_data_config": None, + }, + }, + "pass_flows": [], + "cache_dir": "cache", + "output_dir": "models", + "host": "local_system", + "target": "local_system", + } + + @staticmethod + def register_subcommand(parser: ArgumentParser): + sub_parser = parser.add_parser( + "quantize", + help="Quantize the input model", + ) + + # Logging options + QuantizeCommand._add_logging_options(sub_parser) + + sub_parser.add_argument( + "-o", + "--output_path", + type=str, + required=True, + help="Path to save quantized model weights.", + ) + sub_parser.add_argument( + "--tempdir", default=None, type=str, help="Root directory for tempfile directories and files" + ) + sub_parser.add_argument( + "--algorithms", + type=str, + nargs="*", + required=True, + choices=sorted(QuantizeCommand._CONFIG_TEMPLATE["passes"].keys()), + help="List of quantization algorithms to run.", + ) + + # model options + QuantizeCommand._add_hf_model_options(sub_parser) + + # dataset options + QuantizeCommand._add_dataset_options(sub_parser) + + # accelerator options + QuantizeCommand._add_accelerator_options(sub_parser) + + # remote options + QuantizeCommand._add_remote_options(sub_parser) + + sub_parser.set_defaults(func=QuantizeCommand) + + def _get_run_config(self, tempdir: str) -> Dict[str, Any]: + to_replace = [ + (("input_model", "model_path"), self.args.model_name_or_path), + (("input_model", "load_kwargs", "trust_remote_code"), self.args.trust_remote_code), + (("pass_flows"), [[name] for name in self.args.algorithms]), + (("output_dir"), tempdir), + ] + + if self.args.task: + to_replace.append((("input_model", "task"), self.args.task)) + + config = deepcopy(QuantizeCommand._CONFIG_TEMPLATE) + self._update_dataset_options(config) + self._update_accelerator_options(config) + config["log_severity_level"] = self.args.log_level + + for k, v in to_replace: + if v is not None: + set_nested_dict_value(config, k, v) + + return config + + def run(self): + from olive.workflows import run as olive_run + + set_tempdir(self.args.tempdir) + + with tempfile.TemporaryDirectory() as tempdir: + run_config = self._get_run_config(tempdir) + output = olive_run(run_config) + + if self._is_remote_run(): + # TODO(jambayk): point user to datastore with outputs or download outputs + # both are not 
implemented yet + return + + if QuantizeCommand._get_output_model_number(output) > 0: + # need to improve the output structure of olive run + output_path = Path(self.args.output_path) + output_path.mkdir(parents=True, exist_ok=True) + device_name = "gpu-cuda_model" if self.args.device == "gpu" else "cpu-cpu_model" + for algorithm_name in self.args.algorithms: + hardlink_copy_dir( + Path(tempdir) / algorithm_name / device_name / "model", output_path / algorithm_name + ) + print(f"Quantized models saved to {output_path.resolve()}") + else: + print("Failed to run quantize. Please set the log_level to 1 for more detailed logs.") diff --git a/olive/common/hf/mappings.py b/olive/common/hf/mappings.py index 723928a37..e13bf2d86 100644 --- a/olive/common/hf/mappings.py +++ b/olive/common/hf/mappings.py @@ -70,3 +70,16 @@ "llama": "gpt2", "roberta": "bert", } + +MODEL_OUTSIDE_LAYER_MODULES = { + "phi3": ["model.embed_tokens", "embed_dropout", "model.norm"], +} + +MODEL_INSIDE_LAYER_MODULES = { + "phi3": [ + ["self_attn.qkv_proj"], + ["self_attn.o_proj"], + ["mlp.gate_up_proj"], + ["mlp.down_proj"], + ] +} diff --git a/olive/passes/pytorch/gptq.py b/olive/passes/pytorch/gptq.py index b39bae832..02e3936c3 100644 --- a/olive/passes/pytorch/gptq.py +++ b/olive/passes/pytorch/gptq.py @@ -11,6 +11,7 @@ import torch from olive.common.config_utils import validate_config +from olive.common.hf.mappings import MODEL_INSIDE_LAYER_MODULES, MODEL_OUTSIDE_LAYER_MODULES from olive.data.config import DataConfig from olive.hardware.accelerator import AcceleratorSpec, Device from olive.model import HfModelHandler, PyTorchModelHandler @@ -166,20 +167,26 @@ def _run_for_config( def get_onnx_quant_linear(*args, **kwargs): return QuantLinear - if hasattr(pytorch_model, "config") and pytorch_model.config.model_type in GPTQ_CAUSAL_LM_MODEL_MAP: - model_type = pytorch_model.config.model_type - model_class = GPTQ_CAUSAL_LM_MODEL_MAP[model_type] - quantized_model = model_class(pytorch_model, False, quantize_config) - else: - quantized_model = BaseGPTQForCausalLM(pytorch_model, False, quantize_config) - if not (config["layers_block_name"] and config["outside_layer_modules"] and config["inside_layer_modules"]): - raise ValueError( - "Can't get layers_block_name to quantize automatically, " - "please set layers_block_name, outside_layer_modules and inside_layer_modules in config." 
-            )
-        quantized_model.layers_block_name = config["layers_block_name"]
+        model_type = pytorch_model.config.model_type if hasattr(pytorch_model, "config") else ""
+        model_class = GPTQ_CAUSAL_LM_MODEL_MAP.get(model_type, BaseGPTQForCausalLM)
+        quantized_model = model_class(pytorch_model, False, quantize_config)
+
+        if config["outside_layer_modules"]:
             quantized_model.outside_layer_modules = config["outside_layer_modules"]
+        elif model_type in MODEL_OUTSIDE_LAYER_MODULES:
+            quantized_model.outside_layer_modules = MODEL_OUTSIDE_LAYER_MODULES[model_type]
+        else:
+            raise ValueError("Can't get outside_layer_modules to quantize automatically, please provide it in config.")
+
+        if config["inside_layer_modules"]:
             quantized_model.inside_layer_modules = config["inside_layer_modules"]
+        elif model_type in MODEL_INSIDE_LAYER_MODULES:
+            quantized_model.inside_layer_modules = MODEL_INSIDE_LAYER_MODULES[model_type]
+        else:
+            raise ValueError("Can't get inside_layer_modules to quantize automatically, please provide it in config.")
+
+        if config["layers_block_name"]:
+            quantized_model.layers_block_name = config["layers_block_name"]
 
         import auto_gptq
diff --git a/test/unit_test/cli/test_cli.py b/test/unit_test/cli/test_cli.py
index 393ca23d5..b58b1a21e 100644
--- a/test/unit_test/cli/test_cli.py
+++ b/test/unit_test/cli/test_cli.py
@@ -245,5 +245,47 @@ def test_cloud_cache_command(mock_container_client, test_set):
     mock_container_client().delete_blob.assert_called_once()
 
 
+@pytest.mark.parametrize("algorithm_names", [{"awq"}, {"awq", "gptq"}])
+@patch("olive.workflows.run")
+@patch("olive.cli.quantize.tempfile.TemporaryDirectory")
+def test_quantize_command(mock_tempdir, mock_run, algorithm_names, tmp_path):
+    # some directories
+    tmpdir = tmp_path / "tmpdir"
+    tmpdir.mkdir()
+
+    output_dir = tmp_path / "output_dir"
+
+    # setup mocks
+    mock_tempdir.return_value = tmpdir.resolve()
+    mock_run.return_value = {"output_dir": Footprint(nodes={"dummy_output": "dummy_output"})}
+
+    for algo_name in algorithm_names:
+        workflow_output_dir = tmpdir / algo_name / "cpu-cpu_model" / "model"
+        workflow_output_dir.mkdir(parents=True)
+        dummy_model = workflow_output_dir / "dummy_model"
+        with dummy_model.open("w") as f:
+            f.write("dummy_model")
+
+    # command args
+    command_args = [
+        "quantize",
+        "-m",
+        "dummy_model",
+        "-d",
+        "dummy_dataset",
+        "--algorithms",
+        *algorithm_names,
+        "-o",
+        str(output_dir),
+    ]
+
+    # execute
+    cli_main(command_args)
+
+    config = mock_run.call_args[0][0]
+    assert config["input_model"]["model_path"] == "dummy_model"
+    assert {el.name for el in output_dir.iterdir()} == algorithm_names
+
+
 # TODO(anyone): Add tests for ManageAMLComputeCommand
 # Test for ExportAdaptersCommand is added as part of test/unit_test/passes/onnx/test_export_adapters.py
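
To illustrate how QuantizeCommand._get_run_config assembles its workflow config, here is a minimal standalone sketch. The local set_nested_dict_value is a simplified stand-in whose behavior is inferred from how the real olive.common.utils helper is called in this patch, and CONFIG_TEMPLATE/build_quantize_config are illustrative names; the actual template in quantize.py carries more entries (systems, data_configs, cache_dir, host/target).

# Minimal sketch (not Olive code) of how QuantizeCommand._get_run_config builds a workflow config.
# set_nested_dict_value below is a simplified stand-in for olive.common.utils.set_nested_dict_value,
# with behavior inferred from how this patch calls it; the real helper may differ.
from copy import deepcopy
from typing import Any, Dict, Sequence, Union

CONFIG_TEMPLATE = {
    "input_model": {"type": "HfModel", "load_kwargs": {"attn_implementation": "eager"}},
    "passes": {"awq": {"type": "AutoAWQQuantizer"}, "gptq": {"type": "GptqQuantizer"}},
    "pass_flows": [],
    "output_dir": "models",
}


def set_nested_dict_value(d: Dict[str, Any], keys: Union[str, Sequence[Any]], value: Any) -> None:
    """Walk `keys` into nested dicts and set the final entry to `value`."""
    if isinstance(keys, str):
        keys = (keys,)
    for key in keys[:-1]:
        d = d[key]
    d[keys[-1]] = value


def build_quantize_config(model_name_or_path: str, algorithms: Sequence[str], output_dir: str) -> Dict[str, Any]:
    config = deepcopy(CONFIG_TEMPLATE)
    to_replace = [
        (("input_model", "model_path"), model_name_or_path),
        ("pass_flows", [[name] for name in algorithms]),  # one single-pass flow per selected algorithm
        ("output_dir", output_dir),
    ]
    for keys, value in to_replace:
        if value is not None:
            set_nested_dict_value(config, keys, value)
    return config


if __name__ == "__main__":
    print(build_quantize_config("some-org/some-model", ["awq", "gptq"], "/tmp/quantized"))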
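
For reference, the model-path resolution performed by BaseOliveCLICommand._get_model_name_or_path can be exercised on its own. The sketch below mirrors the two regexes from the patch (named groups restored) in a standalone function; resolve_model_name_or_path is an illustrative name, not part of Olive.

# Standalone sketch of the model-path resolution used by BaseOliveCLICommand._get_model_name_or_path.
import re
from typing import Dict, Union


def resolve_model_name_or_path(model_name_or_path: str) -> Union[str, Dict[str, str]]:
    # "registry_name:model_name:version" -> AzureML registry model config
    match = re.match(r"^(?P<registry_name>[^:]+):(?P<model_name>[^:]+):(?P<version>[^:]+)$", model_name_or_path)
    if match:
        return {
            "type": "azureml_registry_model",
            "registry_name": match.group("registry_name"),
            "name": match.group("model_name"),
            "version": match.group("version"),
        }

    # Hugging Face URL -> "org/repo" id
    match = re.search(r"https://huggingface\.co/([^/]+/[^/]+)(?:/.*)?", model_name_or_path)
    if match:
        return match.group(1)

    # Anything else (local path, plain HF id) is passed through unchanged.
    return model_name_or_path


print(resolve_model_name_or_path("my-registry:my-model:3"))
print(resolve_model_name_or_path("https://huggingface.co/org/repo/tree/main"))
print(resolve_model_name_or_path("org/repo"))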
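
The GptqQuantizer change resolves outside/inside layer modules in this order: explicit config value, then the per-architecture defaults added to olive/common/hf/mappings.py, then an error. Below is a small sketch of that resolution order on plain dicts, assuming the same mapping shapes as in the patch; resolve_layer_modules is an illustrative helper, not the pass itself.

# Sketch of the fallback order the updated GptqQuantizer uses to pick layer modules:
# explicit config value -> per-architecture default from olive/common/hf/mappings.py -> error.
from typing import Dict, Optional

MODEL_OUTSIDE_LAYER_MODULES = {
    "phi3": ["model.embed_tokens", "embed_dropout", "model.norm"],
}

MODEL_INSIDE_LAYER_MODULES = {
    "phi3": [
        ["self_attn.qkv_proj"],
        ["self_attn.o_proj"],
        ["mlp.gate_up_proj"],
        ["mlp.down_proj"],
    ]
}


def resolve_layer_modules(config: Dict[str, Optional[list]], model_type: str) -> Dict[str, list]:
    resolved = {}
    for key, defaults in (
        ("outside_layer_modules", MODEL_OUTSIDE_LAYER_MODULES),
        ("inside_layer_modules", MODEL_INSIDE_LAYER_MODULES),
    ):
        if config.get(key):
            resolved[key] = config[key]
        elif model_type in defaults:
            resolved[key] = defaults[model_type]
        else:
            raise ValueError(f"Can't get {key} to quantize automatically, please provide it in config.")
    return resolved


print(resolve_layer_modules({"outside_layer_modules": None, "inside_layer_modules": None}, "phi3"))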