From 62e09528ffd42ea3f791ded96831b9cd6c3bd745 Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 10 Feb 2025 23:34:03 +0000 Subject: [PATCH 1/6] hacks --- .../modifiers/quantization/cache.py | 2 ++ src/llmcompressor/observers/base.py | 20 ++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py index 5b2be2c65..9b7a62141 100644 --- a/src/llmcompressor/modifiers/quantization/cache.py +++ b/src/llmcompressor/modifiers/quantization/cache.py @@ -90,6 +90,7 @@ def update( self.k_observers.append(k_observer) self.v_observers.append(v_observer) + # batch x heads x seq_len x head_dim q_key_states = self._quantize( key_states.contiguous(), KVCacheScaleType.KEY, layer_idx ) @@ -150,6 +151,7 @@ def _quantize(self, tensor, kv_type, layer_idx): scales = self.v_scales zps = self.v_zps + # tensor scale, zp = observer(tensor) if len(scales) <= layer_idx: scales.append(scale) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index e70125908..2aa09b067 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -124,7 +124,25 @@ def get_qparams( elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL: # assume observed is transposed, because its the output, hence use dim 0 - self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0) + # we pass in [1, 8, 2048, 128] for k_states + # normally per channel: (output_dim, 1) and you have as many scales as the output_dim + # we want 8 - num_k_head_scales? or + #breakpoint() + + # weight --> get scales along the first dimension (output dim is first dim) + # weight shape (output_dim, input_dim) + # self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0) + # output when applied to the weight: (output_dim, 1) + + + # for outputs: + self._scale, self._zero_point = self.get_qparams_along_dim(observed, 2) + self._scale = self._scale.squeeze(1) + self._zero_point = self._zero_point.squeeze(1) + # why is the output of self._scale: [1, 1, 1] + + + elif self.quantization_args.strategy == QuantizationStrategy.TOKEN: # use dim 1, assume the obsersed.shape = [batch, token, hidden] From f4e1d05d0b02e13a1445e41c501ac762dd0fb357 Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 11 Feb 2025 19:35:35 +0000 Subject: [PATCH 2/6] update example --- .../llama3_fp8_kv_example.py | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 6c08d4acc..57fcb84e1 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -6,6 +6,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", @@ -19,7 +20,7 @@ # Select number of samples. 512 samples is a good place to start. # Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 512 +NUM_CALIBRATION_SAMPLES = 10 MAX_SEQUENCE_LENGTH = 2048 # Load dataset and preprocess. 
@@ -49,28 +50,40 @@ def process_and_tokenize(example): quant_stage: quant_modifiers: QuantizationModifier: - ignore: ["lm_head"] config_groups: - group_0: - weights: + fp8_attention: + output_activations: + num_bits: 8 + type: float + strategy: channel + dynamic: false + symmetric: true + targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] +""" +recipe = """ +quant_stage: + quant_modifiers: + QuantizationModifier: + config_groups: + fp8_attention_q_proj: + output_activations: num_bits: 8 type: float - strategy: tensor + strategy: group + group_size: 512 dynamic: false symmetric: true - input_activations: + targets: ['re:.*q_proj'] + fp8_attention_kv_proj: + output_activations: num_bits: 8 type: float - strategy: tensor + strategy: group + group_size: 128 dynamic: false symmetric: true - targets: ["Linear"] - kv_cache_scheme: - num_bits: 8 - type: float - strategy: tensor - dynamic: false - symmetric: true + targets: ['re:.*k_proj', 're:.*v_proj'] + """ # Apply algorithms. @@ -96,6 +109,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +SAVE_DIR = MODEL_ID.split("/")[1] + "-AttnQuantOnly-Group" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From c2a20165770974a77d20373042beab4e12a86720 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 6 Mar 2025 23:20:31 -0500 Subject: [PATCH 3/6] channel wise fp8 quantization, attention modules Signed-off-by: George Ohashi --- .../llama3_fp8_kv_example.py | 52 ++++++++++--------- .../modifiers/quantization/calibration.py | 4 +- src/llmcompressor/observers/base.py | 47 +++++++++-------- 3 files changed, 55 insertions(+), 48 deletions(-) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 062edc562..6aeef060c 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -58,33 +58,35 @@ def process_and_tokenize(example): strategy: channel dynamic: false symmetric: true - targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] + # targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] + targets: ['re:.*q_proj',] + """ -recipe = """ -quant_stage: - quant_modifiers: - QuantizationModifier: - config_groups: - fp8_attention_q_proj: - output_activations: - num_bits: 8 - type: float - strategy: group - group_size: 512 - dynamic: false - symmetric: true - targets: ['re:.*q_proj'] - fp8_attention_kv_proj: - output_activations: - num_bits: 8 - type: float - strategy: group - group_size: 128 - dynamic: false - symmetric: true - targets: ['re:.*k_proj', 're:.*v_proj'] +# recipe = """ +# quant_stage: +# quant_modifiers: +# QuantizationModifier: +# config_groups: +# fp8_attention_q_proj: +# output_activations: +# num_bits: 8 +# type: float +# strategy: channel +# # group_size: 512 +# dynamic: false +# symmetric: true +# targets: ['re:.*q_proj'] +# # fp8_attention_kv_proj: +# # output_activations: +# # num_bits: 8 +# # type: float +# # strategy: group +# # group_size: 128 +# # dynamic: false +# # symmetric: true +# # targets: ['re:.*k_proj', 're:.*v_proj'] -""" +# """ # Apply algorithms. 
oneshot( diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index bcb4b7433..37cca0ac8 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -81,7 +81,9 @@ def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor] raise ValueError("Must provide a value to observe if not using weight observer") observer = getattr(module, f"{base_name}_observer") - updated_scale, updated_zero_point = observer(value, g_idx=g_idx) + updated_scale, updated_zero_point = observer( + value, g_idx=g_idx, base_name=base_name + ) # update scale and zero point update_parameter_data(module, updated_scale, f"{base_name}_scale") diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index 2aa09b067..a9a107559 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -31,7 +31,10 @@ def __init__(self, quantization_args: QuantizationArgs): @torch.no_grad() def forward( - self, observed: Tensor, g_idx: Optional[Tensor] = None + self, + observed: Tensor, + g_idx: Optional[Tensor] = None, + base_name: Optional[str] = None, ) -> Tuple[FloatTensor, IntTensor]: """ maps directly to get_qparams @@ -40,8 +43,9 @@ def forward( :param g_idx: optional mapping from column index to group index :return: tuple of scale and zero point based on last observed value """ + # breakpoint() self.record_observed_tokens(observed) - return self.get_qparams(observed=observed, g_idx=g_idx) + return self.get_qparams(observed=observed, g_idx=g_idx, base_name=base_name) def calculate_qparams( self, @@ -66,6 +70,7 @@ def get_qparams( self, observed: Optional[Tensor] = None, g_idx: Optional[Tensor] = None, + base_name: Optional[str] = None, ) -> Tuple[FloatTensor, IntTensor]: """ Convenience function to wrap overwritten calculate_qparams @@ -123,26 +128,24 @@ def get_qparams( self._zero_point[:, group_index] = zero_point.squeeze(1) elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL: - # assume observed is transposed, because its the output, hence use dim 0 - # we pass in [1, 8, 2048, 128] for k_states - # normally per channel: (output_dim, 1) and you have as many scales as the output_dim - # we want 8 - num_k_head_scales? 
or - #breakpoint() - - # weight --> get scales along the first dimension (output dim is first dim) - # weight shape (output_dim, input_dim) - # self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0) - # output when applied to the weight: (output_dim, 1) - - - # for outputs: - self._scale, self._zero_point = self.get_qparams_along_dim(observed, 2) - self._scale = self._scale.squeeze(1) - self._zero_point = self._zero_point.squeeze(1) - # why is the output of self._scale: [1, 1, 1] - - - + if base_name == "output": + # the last dimension is the hidden dimension + # shape of [1,1, num_key_value_heads * head_dim] + scale, zero_point = self.get_qparams_along_dim( + observed, observed.ndim - 1 + ) + self._scale = ( + scale.squeeze() + ) # shape of [num_key_value_heads * head_dim] + self._zero_point = ( + zero_point.squeeze() + ) # shape of [num_key_value_heads * head_dim] + else: + # weight or input + # assume observed is transposed, because its the output, hence use dim 0 + self._scale, self._zero_point = self.get_qparams_along_dim( + observed, 0 + ) elif self.quantization_args.strategy == QuantizationStrategy.TOKEN: # use dim 1, assume the obsersed.shape = [batch, token, hidden] From 189e9d5e7c490700b1aa20eb54a77c2283febe94 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 6 Mar 2025 23:22:49 -0500 Subject: [PATCH 4/6] revert example script Signed-off-by: George Ohashi --- .../llama3_fp8_kv_example.py | 53 +++++++------------ 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 6aeef060c..72872b913 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -6,7 +6,6 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", @@ -20,7 +19,7 @@ # Select number of samples. 512 samples is a good place to start. # Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 10 +NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 # Load dataset and preprocess. @@ -50,43 +49,29 @@ def process_and_tokenize(example): quant_stage: quant_modifiers: QuantizationModifier: + ignore: ["lm_head"] config_groups: - fp8_attention: - output_activations: + group_0: + weights: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true + input_activations: num_bits: 8 type: float - strategy: channel + strategy: tensor dynamic: false symmetric: true - # targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] - targets: ['re:.*q_proj',] - + targets: ["Linear"] + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true """ -# recipe = """ -# quant_stage: -# quant_modifiers: -# QuantizationModifier: -# config_groups: -# fp8_attention_q_proj: -# output_activations: -# num_bits: 8 -# type: float -# strategy: channel -# # group_size: 512 -# dynamic: false -# symmetric: true -# targets: ['re:.*q_proj'] -# # fp8_attention_kv_proj: -# # output_activations: -# # num_bits: 8 -# # type: float -# # strategy: group -# # group_size: 128 -# # dynamic: false -# # symmetric: true -# # targets: ['re:.*k_proj', 're:.*v_proj'] - -# """ # Apply algorithms. oneshot( @@ -111,6 +96,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-AttnQuantOnly-Group" +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 78222babd6a29bf85f409a587a2aa098b8bc9823 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 6 Mar 2025 23:36:00 -0500 Subject: [PATCH 5/6] lint Signed-off-by: George Ohashi --- src/llmcompressor/observers/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index a9a107559..fd8b977a0 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -142,7 +142,8 @@ def get_qparams( ) # shape of [num_key_value_heads * head_dim] else: # weight or input - # assume observed is transposed, because its the output, hence use dim 0 + # assume observed is transposed, + # because its the output, hence use dim 0 self._scale, self._zero_point = self.get_qparams_along_dim( observed, 0 ) From 5d13e2b7e5eb9c6aba368533a704e2ef598e85d8 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 7 Mar 2025 09:28:36 -0500 Subject: [PATCH 6/6] kv-cache int8 quant Signed-off-by: George Ohashi --- src/llmcompressor/modifiers/quantization/cache.py | 11 +++++++++-- src/llmcompressor/observers/base.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py index 9b7a62141..4d9f48e62 100644 --- a/src/llmcompressor/modifiers/quantization/cache.py +++ b/src/llmcompressor/modifiers/quantization/cache.py @@ -151,8 +151,15 @@ def _quantize(self, tensor, kv_type, layer_idx): scales = self.v_scales zps = self.v_zps - # tensor - scale, zp = observer(tensor) + # note: key, value states are in the shape: + # [batch, num_key_value_heads, seq_len, head_dim] + + base_name = None # tensor-wise quantization, shape of [1] + if self.quantization_args.strategy == "channel": + # target last dim to quantize, shape of [head_dim] + base_name = "kv_cache" + + scale, zp = observer(tensor, base_name=base_name) if len(scales) <= layer_idx: scales.append(scale) zps.append(zp) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index fd8b977a0..9bc030b52 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -128,7 +128,7 @@ def get_qparams( self._zero_point[:, group_index] = zero_point.squeeze(1) elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL: - if base_name == "output": + if base_name in ("output", "kv_cache"): # the last dimension is the hidden dimension # shape of [1,1, num_key_value_heads * head_dim] scale, zero_point = self.get_qparams_along_dim(
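---

For illustration, below is a minimal, standalone sketch of the per-channel, last-dimension scale computation that patches 3 and 6 work toward: one scale per channel of the final dimension of the observed activations, so the same logic covers attention outputs (`[batch, seq_len, hidden_dim]` -> `hidden_dim` scales) and KV states (`[batch, num_key_value_heads, seq_len, head_dim]` -> `head_dim` scales). This is a hedged approximation written from scratch; the function name `per_channel_qparams`, the symmetric int8-style range, and the example shapes are assumptions for illustration and are not the llm-compressor observer API.

```python
import torch


def per_channel_qparams(observed: torch.Tensor, num_bits: int = 8):
    """Illustrative only: symmetric scales/zero-points along the last dim.

    Mirrors the idea in the patches above (one scale per channel of the
    final dimension), not the actual llm-compressor Observer interface.
    """
    # Reduce over every dimension except the last one.
    reduce_dims = tuple(range(observed.ndim - 1))
    abs_max = observed.abs().amax(dim=reduce_dims)  # shape: [last_dim]

    # Symmetric int8-style range; an fp8 variant would use the format's max
    # representable value (e.g. 448.0 for float8_e4m3fn) instead of 2**(n-1)-1.
    q_max = 2 ** (num_bits - 1) - 1
    scale = (abs_max / q_max).clamp(min=torch.finfo(observed.dtype).eps)

    # Symmetric quantization -> zero point is always 0.
    zero_point = torch.zeros_like(scale, dtype=torch.int64)
    return scale, zero_point


# Example: fake KV states shaped [batch, num_key_value_heads, seq_len, head_dim],
# matching the shape noted in the cache.py comments above.
key_states = torch.randn(1, 8, 2048, 128)
scale, zp = per_channel_qparams(key_states)
print(scale.shape)  # torch.Size([128]) -- one scale per head_dim channel
```

Reducing over all leading dimensions is what lets the `CHANNEL` branch in `get_qparams` serve both the `"output"` and `"kv_cache"` cases with a single code path, which appears to be why patch 6 routes the KV-cache observer call through the same last-dimension logic as the attention-output case introduced in patch 3.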