Add new INT4 quantization features to model builder #940

Merged 8 commits on Nov 1, 2024
src/python/py/models/builder.py (32 changes: 22 additions & 10 deletions)
@@ -40,10 +40,10 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
self.io_dtype = io_dtype # {'fp16', 'fp32'}
self.onnx_dtype = onnx_dtype # {"int4", "fp16", "fp32"}
self.quant_type = config.quantization_config["quant_method"] if hasattr(config, "quantization_config") else None
- self.adapter_path = extra_options["adapter_path"] if "adapter_path" in extra_options else None
+ self.adapter_path = extra_options.get("adapter_path", None)

self.cache_dir = cache_dir
- self.filename = extra_options["filename"] if "filename" in extra_options else "model.onnx"
+ self.filename = extra_options.get("filename", "model.onnx")
self.hf_token = parse_hf_token(extra_options.get("hf_token", "true"))
self.extra_options = extra_options

@@ -54,7 +54,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
self.nodes = []

# EP-specific variables
enable_cuda_graph = "1" if "enable_cuda_graph" in extra_options else "0"
enable_cuda_graph = extra_options.get("enable_cuda_graph", "0")
self.ep = ep
self.ep_attrs = {
"cpu": {},
@@ -288,8 +288,9 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
# Quantization-specific variables (INT4, INT8, etc.)
self.quant_attrs = {
"int4": {
"block_size": int(extra_options["int4_block_size"]) if "int4_block_size" in extra_options else 32,
"accuracy_level": int(extra_options["int4_accuracy_level"]) if "int4_accuracy_level" in extra_options else 0, # Default is 0 for non-QDQ formats, default is 4 for QDQ formats
"accuracy_level": int(extra_options.get("int4_accuracy_level", 0)), # Default is 0 for non-QDQ formats, default is 4 for QDQ formats
"block_size": int(extra_options.get("int4_block_size", 32)),
"op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
},
"use_qdq": False, # Use QDQ format
}
@@ -401,7 +402,8 @@ def save_model(self, out_dir):

# Quantize ONNX model to desired precision
# TODO: Replace by quantizing the MatMuls as they are created
if self.onnx_dtype == "int4" and self.quant_type is None:
already_quantized_in_qdq_format = self.quant_type is not None and self.quant_attrs["use_qdq"] # Skip quantizing `MatMul` in `DequantizeLinear --> Transpose --> MatMul` path
if self.onnx_dtype == "int4" and not already_quantized_in_qdq_format:
model = self.to_int4(model)

# Save ONNX model with only one external data file and delete any existing duplicate copies
@@ -432,6 +434,7 @@ def to_int4(self, model):
accuracy_level=self.quant_attrs["int4"]["accuracy_level"],
nodes_to_exclude=[],
quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
+ op_types_to_quantize=self.quant_attrs["int4"]["op_types_to_quantize"],
)
quant.process()
return quant.model.model
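
For context, here is a minimal standalone sketch of the same quantization step outside the builder, assuming the quantizer is onnxruntime's MatMul4BitsQuantizer and that it accepts the keyword arguments used in the diff above; the file names and option values are illustrative, not taken from this PR:

    import onnx
    from onnxruntime.quantization import QuantFormat
    from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer

    # Quantize the MatMul (and optionally Gather) weights of an existing ONNX model to INT4
    model = onnx.load("model.onnx")
    quant = MatMul4BitsQuantizer(
        model,
        block_size=32,                              # builder default for int4_block_size
        accuracy_level=4,                           # 4 = int8 activations for the quantized MatMuls
        nodes_to_exclude=[],
        quant_format=QuantFormat.QOperator,         # QuantFormat.QDQ when the QDQ decomposition is requested
        op_types_to_quantize=("MatMul", "Gather"),  # the new option surfaced by this PR
    )
    quant.process()
    onnx.save(quant.model.model, "model_int4.onnx")
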
@@ -3004,6 +3007,12 @@ def make_layer(self, layer_id, layer):
super().make_layer(layer_id, layer)

def check_extra_options(kv_pairs):
if "int4_op_types_to_quantize" in kv_pairs:
op_types_to_quantize = ()
for op_type in kv_pairs["int4_op_types_to_quantize"].split("/"):
op_types_to_quantize += (op_type, )
kv_pairs["int4_op_types_to_quantize"] = op_types_to_quantize

if "use_8bits_moe" in kv_pairs:
assert(kv_pairs["use_8bits_moe"] == "1" or kv_pairs["use_8bits_moe"] == "0"), "use_8bits_moe must be 0 or 1."

@@ -3181,12 +3190,15 @@ def get_args():
nargs='+',
help=textwrap.dedent("""\
Key value pairs for various options. Currently supports:
- int4_block_size = 16/32/64/128/256: Specify the block_size for int4 quantization.
int4_accuracy_level = 1/2/3/4: Specify the minimum accuracy level for activation of MatMul in int4 quantization.
4 is int8, which means input A of int4 quantized MatMul is quantized to int8 and input B is upcasted to int8 for computation.
3 is bf16.
2 is fp16.
1 is fp32.
+ int4_block_size = 16/32/64/128/256: Specify the block_size for int4 quantization.
+ int4_op_types_to_quantize = MatMul/Gather: Specify op types to target for int4 quantization.
+ Use this option when you want to quantize specific ops.
+ Separate the op types with a '/' when passing them here (e.g. int4_op_types_to_quantize=MatMul/Gather)
num_hidden_layers = Manually specify the number of layers in your ONNX model (for unit testing purposes).
filename = Filename for ONNX model (default is 'model.onnx').
For models with multiple components, each component is exported to its own ONNX model.
@@ -3199,13 +3211,13 @@ def get_args():
exclude_lm_head = Remove language modeling head from your ONNX model.
Use this option when you want to remove the language modeling head from within your ONNX model.
Instead of `logits`, you will have `hidden_states` as the output to your ONNX model.
- enable_cuda_graph = 1 : The model can use CUDA graph capture for CUDA execution provider. If enabled, all nodes being placed on the CUDA EP
+ enable_cuda_graph = 1: The model can use CUDA graph capture for CUDA execution provider. If enabled, all nodes being placed on the CUDA EP
is the prerequisite for the CUDA graph to be used correctly. It is not guaranteed that cuda graph be enabled as it depends on the model
and the graph structure.
- use_8bits_moe = 1 : Use 8-bit quantization for MoE layers. Default is using 4-bit quantization.
+ use_8bits_moe = 1: Use 8-bit quantization for MoE layers. Default is using 4-bit quantization.
hf_token = false/token: Use this to disable authentication with Hugging Face or provide a custom authentication token that differs from the one stored in your environment. Default behavior is to use the authentication token stored by `huggingface-cli login`.
If you have already authenticated via `huggingface-cli login`, you do not need to use this flag because Hugging Face has already stored your authentication token for you.
- use_qdq = 1 : Use the QDQ decomposition for quantized MatMul instead of the MatMulNBits operator.
+ use_qdq = 1: Use the QDQ decomposition for quantized MatMul instead of the MatMulNBits operator.
adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
"""),
)
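
For illustration, a hypothetical invocation that exercises the new options might look like the line below; the builder entry point and every flag other than the extra options documented above are assumptions based on typical model-builder usage, not part of this diff:

    python3 builder.py -m my_model -o ./output_int4 -p int4 -e cpu --extra_options int4_accuracy_level=4 int4_block_size=64 int4_op_types_to_quantize=MatMul/Gather

Here int4_op_types_to_quantize=MatMul/Gather is split on '/' by check_extra_options into the tuple ("MatMul", "Gather") and forwarded to the quantizer via quant_attrs.
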