From 62e09528ffd42ea3f791ded96831b9cd6c3bd745 Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 10 Feb 2025 23:34:03 +0000 Subject: [PATCH 1/6] hacks --- .../modifiers/quantization/cache.py | 2 ++ src/llmcompressor/observers/base.py | 20 ++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py index 5b2be2c65..9b7a62141 100644 --- a/src/llmcompressor/modifiers/quantization/cache.py +++ b/src/llmcompressor/modifiers/quantization/cache.py @@ -90,6 +90,7 @@ def update( self.k_observers.append(k_observer) self.v_observers.append(v_observer) + # batch x heads x seq_len x head_dim q_key_states = self._quantize( key_states.contiguous(), KVCacheScaleType.KEY, layer_idx ) @@ -150,6 +151,7 @@ def _quantize(self, tensor, kv_type, layer_idx): scales = self.v_scales zps = self.v_zps + # tensor scale, zp = observer(tensor) if len(scales) <= layer_idx: scales.append(scale) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index e70125908..2aa09b067 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -124,7 +124,25 @@ def get_qparams( elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL: # assume observed is transposed, because its the output, hence use dim 0 - self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0) + # we pass in [1, 8, 2048, 128] for k_states + # normally per channel: (output_dim, 1) and you have as many scales as the output_dim + # we want 8 - num_k_head_scales? or + #breakpoint() + + # weight --> get scales along the first dimension (output dim is first dim) + # weight shape (output_dim, input_dim) + # self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0) + # output when applied to the weight: (output_dim, 1) + + + # for outputs: + self._scale, self._zero_point = self.get_qparams_along_dim(observed, 2) + self._scale = self._scale.squeeze(1) + self._zero_point = self._zero_point.squeeze(1) + # why is the output of self._scale: [1, 1, 1] + + + elif self.quantization_args.strategy == QuantizationStrategy.TOKEN: # use dim 1, assume the obsersed.shape = [batch, token, hidden] From f4e1d05d0b02e13a1445e41c501ac762dd0fb357 Mon Sep 17 00:00:00 2001 From: Dipika Date: Tue, 11 Feb 2025 19:35:35 +0000 Subject: [PATCH 2/6] update example --- .../llama3_fp8_kv_example.py | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 6c08d4acc..57fcb84e1 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -6,6 +6,7 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", @@ -19,7 +20,7 @@ # Select number of samples. 512 samples is a good place to start. # Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 512 +NUM_CALIBRATION_SAMPLES = 10 MAX_SEQUENCE_LENGTH = 2048 # Load dataset and preprocess. 
@@ -49,28 +50,40 @@ def process_and_tokenize(example): quant_stage: quant_modifiers: QuantizationModifier: - ignore: ["lm_head"] config_groups: - group_0: - weights: + fp8_attention: + output_activations: + num_bits: 8 + type: float + strategy: channel + dynamic: false + symmetric: true + targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] +""" +recipe = """ +quant_stage: + quant_modifiers: + QuantizationModifier: + config_groups: + fp8_attention_q_proj: + output_activations: num_bits: 8 type: float - strategy: tensor + strategy: group + group_size: 512 dynamic: false symmetric: true - input_activations: + targets: ['re:.*q_proj'] + fp8_attention_kv_proj: + output_activations: num_bits: 8 type: float - strategy: tensor + strategy: group + group_size: 128 dynamic: false symmetric: true - targets: ["Linear"] - kv_cache_scheme: - num_bits: 8 - type: float - strategy: tensor - dynamic: false - symmetric: true + targets: ['re:.*k_proj', 're:.*v_proj'] + """ # Apply algorithms. @@ -96,6 +109,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. -SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +SAVE_DIR = MODEL_ID.split("/")[1] + "-AttnQuantOnly-Group" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From c2a20165770974a77d20373042beab4e12a86720 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 6 Mar 2025 23:20:31 -0500 Subject: [PATCH 3/6] channel wise fp8 quantization, attention modules Signed-off-by: George Ohashi --- .../llama3_fp8_kv_example.py | 52 ++++++++++--------- .../modifiers/quantization/calibration.py | 4 +- src/llmcompressor/observers/base.py | 47 +++++++++-------- 3 files changed, 55 insertions(+), 48 deletions(-) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 062edc562..6aeef060c 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -58,33 +58,35 @@ def process_and_tokenize(example): strategy: channel dynamic: false symmetric: true - targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] + # targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] + targets: ['re:.*q_proj',] + """ -recipe = """ -quant_stage: - quant_modifiers: - QuantizationModifier: - config_groups: - fp8_attention_q_proj: - output_activations: - num_bits: 8 - type: float - strategy: group - group_size: 512 - dynamic: false - symmetric: true - targets: ['re:.*q_proj'] - fp8_attention_kv_proj: - output_activations: - num_bits: 8 - type: float - strategy: group - group_size: 128 - dynamic: false - symmetric: true - targets: ['re:.*k_proj', 're:.*v_proj'] +# recipe = """ +# quant_stage: +# quant_modifiers: +# QuantizationModifier: +# config_groups: +# fp8_attention_q_proj: +# output_activations: +# num_bits: 8 +# type: float +# strategy: channel +# # group_size: 512 +# dynamic: false +# symmetric: true +# targets: ['re:.*q_proj'] +# # fp8_attention_kv_proj: +# # output_activations: +# # num_bits: 8 +# # type: float +# # strategy: group +# # group_size: 128 +# # dynamic: false +# # symmetric: true +# # targets: ['re:.*k_proj', 're:.*v_proj'] -""" +# """ # Apply algorithms. 
oneshot( diff --git a/src/llmcompressor/modifiers/quantization/calibration.py b/src/llmcompressor/modifiers/quantization/calibration.py index bcb4b7433..37cca0ac8 100644 --- a/src/llmcompressor/modifiers/quantization/calibration.py +++ b/src/llmcompressor/modifiers/quantization/calibration.py @@ -81,7 +81,9 @@ def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor] raise ValueError("Must provide a value to observe if not using weight observer") observer = getattr(module, f"{base_name}_observer") - updated_scale, updated_zero_point = observer(value, g_idx=g_idx) + updated_scale, updated_zero_point = observer( + value, g_idx=g_idx, base_name=base_name + ) # update scale and zero point update_parameter_data(module, updated_scale, f"{base_name}_scale") diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index 2aa09b067..a9a107559 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -31,7 +31,10 @@ def __init__(self, quantization_args: QuantizationArgs): @torch.no_grad() def forward( - self, observed: Tensor, g_idx: Optional[Tensor] = None + self, + observed: Tensor, + g_idx: Optional[Tensor] = None, + base_name: Optional[str] = None, ) -> Tuple[FloatTensor, IntTensor]: """ maps directly to get_qparams @@ -40,8 +43,9 @@ def forward( :param g_idx: optional mapping from column index to group index :return: tuple of scale and zero point based on last observed value """ + # breakpoint() self.record_observed_tokens(observed) - return self.get_qparams(observed=observed, g_idx=g_idx) + return self.get_qparams(observed=observed, g_idx=g_idx, base_name=base_name) def calculate_qparams( self, @@ -66,6 +70,7 @@ def get_qparams( self, observed: Optional[Tensor] = None, g_idx: Optional[Tensor] = None, + base_name: Optional[str] = None, ) -> Tuple[FloatTensor, IntTensor]: """ Convenience function to wrap overwritten calculate_qparams @@ -123,26 +128,24 @@ def get_qparams( self._zero_point[:, group_index] = zero_point.squeeze(1) elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL: - # assume observed is transposed, because its the output, hence use dim 0 - # we pass in [1, 8, 2048, 128] for k_states - # normally per channel: (output_dim, 1) and you have as many scales as the output_dim - # we want 8 - num_k_head_scales? 
or - #breakpoint() - - # weight --> get scales along the first dimension (output dim is first dim) - # weight shape (output_dim, input_dim) - # self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0) - # output when applied to the weight: (output_dim, 1) - - - # for outputs: - self._scale, self._zero_point = self.get_qparams_along_dim(observed, 2) - self._scale = self._scale.squeeze(1) - self._zero_point = self._zero_point.squeeze(1) - # why is the output of self._scale: [1, 1, 1] - - - + if base_name == "output": + # the last dimension is the hidden dimension + # shape of [1,1, num_key_value_heads * head_dim] + scale, zero_point = self.get_qparams_along_dim( + observed, observed.ndim - 1 + ) + self._scale = ( + scale.squeeze() + ) # shape of [num_key_value_heads * head_dim] + self._zero_point = ( + zero_point.squeeze() + ) # shape of [num_key_value_heads * head_dim] + else: + # weight or input + # assume observed is transposed, because its the output, hence use dim 0 + self._scale, self._zero_point = self.get_qparams_along_dim( + observed, 0 + ) elif self.quantization_args.strategy == QuantizationStrategy.TOKEN: # use dim 1, assume the obsersed.shape = [batch, token, hidden] From 189e9d5e7c490700b1aa20eb54a77c2283febe94 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 6 Mar 2025 23:22:49 -0500 Subject: [PATCH 4/6] revert example script Signed-off-by: George Ohashi --- .../llama3_fp8_kv_example.py | 53 +++++++------------ 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/examples/quantization_kv_cache/llama3_fp8_kv_example.py b/examples/quantization_kv_cache/llama3_fp8_kv_example.py index 6aeef060c..72872b913 100644 --- a/examples/quantization_kv_cache/llama3_fp8_kv_example.py +++ b/examples/quantization_kv_cache/llama3_fp8_kv_example.py @@ -6,7 +6,6 @@ # Select model and load it. MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - model = AutoModelForCausalLM.from_pretrained( MODEL_ID, device_map="auto", @@ -20,7 +19,7 @@ # Select number of samples. 512 samples is a good place to start. # Increasing the number of samples can improve accuracy. -NUM_CALIBRATION_SAMPLES = 10 +NUM_CALIBRATION_SAMPLES = 512 MAX_SEQUENCE_LENGTH = 2048 # Load dataset and preprocess. @@ -50,43 +49,29 @@ def process_and_tokenize(example): quant_stage: quant_modifiers: QuantizationModifier: + ignore: ["lm_head"] config_groups: - fp8_attention: - output_activations: + group_0: + weights: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true + input_activations: num_bits: 8 type: float - strategy: channel + strategy: tensor dynamic: false symmetric: true - # targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'] - targets: ['re:.*q_proj',] - + targets: ["Linear"] + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true """ -# recipe = """ -# quant_stage: -# quant_modifiers: -# QuantizationModifier: -# config_groups: -# fp8_attention_q_proj: -# output_activations: -# num_bits: 8 -# type: float -# strategy: channel -# # group_size: 512 -# dynamic: false -# symmetric: true -# targets: ['re:.*q_proj'] -# # fp8_attention_kv_proj: -# # output_activations: -# # num_bits: 8 -# # type: float -# # strategy: group -# # group_size: 128 -# # dynamic: false -# # symmetric: true -# # targets: ['re:.*k_proj', 're:.*v_proj'] - -# """ # Apply algorithms. oneshot( @@ -111,6 +96,6 @@ def process_and_tokenize(example): print("==========================================\n\n") # Save to disk compressed. 
-SAVE_DIR = MODEL_ID.split("/")[1] + "-AttnQuantOnly-Group" +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" model.save_pretrained(SAVE_DIR, save_compressed=True) tokenizer.save_pretrained(SAVE_DIR) From 78222babd6a29bf85f409a587a2aa098b8bc9823 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Thu, 6 Mar 2025 23:36:00 -0500 Subject: [PATCH 5/6] lint Signed-off-by: George Ohashi --- src/llmcompressor/observers/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index a9a107559..fd8b977a0 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -142,7 +142,8 @@ def get_qparams( ) # shape of [num_key_value_heads * head_dim] else: # weight or input - # assume observed is transposed, because its the output, hence use dim 0 + # assume observed is transposed, + # because its the output, hence use dim 0 self._scale, self._zero_point = self.get_qparams_along_dim( observed, 0 ) From 5d13e2b7e5eb9c6aba368533a704e2ef598e85d8 Mon Sep 17 00:00:00 2001 From: George Ohashi Date: Fri, 7 Mar 2025 09:28:36 -0500 Subject: [PATCH 6/6] kv-cache int8 quant Signed-off-by: George Ohashi --- src/llmcompressor/modifiers/quantization/cache.py | 11 +++++++++-- src/llmcompressor/observers/base.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/llmcompressor/modifiers/quantization/cache.py b/src/llmcompressor/modifiers/quantization/cache.py index 9b7a62141..4d9f48e62 100644 --- a/src/llmcompressor/modifiers/quantization/cache.py +++ b/src/llmcompressor/modifiers/quantization/cache.py @@ -151,8 +151,15 @@ def _quantize(self, tensor, kv_type, layer_idx): scales = self.v_scales zps = self.v_zps - # tensor - scale, zp = observer(tensor) + # note: key, value states are in the shape: + # [batch, num_key_value_heads, seq_len, head_dim] + + base_name = None # tensor-wise quantization, shape of [1] + if self.quantization_args.strategy == "channel": + # target last dim to quantize, shape of [head_dim] + base_name = "kv_cache" + + scale, zp = observer(tensor, base_name=base_name) if len(scales) <= layer_idx: scales.append(scale) zps.append(zp) diff --git a/src/llmcompressor/observers/base.py b/src/llmcompressor/observers/base.py index fd8b977a0..9bc030b52 100644 --- a/src/llmcompressor/observers/base.py +++ b/src/llmcompressor/observers/base.py @@ -128,7 +128,7 @@ def get_qparams( self._zero_point[:, group_index] = zero_point.squeeze(1) elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL: - if base_name == "output": + if base_name in ("output", "kv_cache"): # the last dimension is the hidden dimension # shape of [1,1, num_key_value_heads * head_dim] scale, zero_point = self.get_qparams_along_dim(
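---

For illustration, below is a minimal, standalone sketch of the per-channel, last-dimension scale computation that patches 3 and 6 work toward: one scale per channel of the final dimension of the observed activations, so the same logic covers attention outputs (`[batch, seq_len, hidden_dim]` -> `hidden_dim` scales) and KV states (`[batch, num_key_value_heads, seq_len, head_dim]` -> `head_dim` scales). This is a hedged approximation written from scratch; the function name `per_channel_qparams`, the symmetric int8-style range, and the example shapes are assumptions for illustration and are not the llm-compressor observer API.

```python
import torch


def per_channel_qparams(observed: torch.Tensor, num_bits: int = 8):
    """Illustrative only: symmetric scales/zero-points along the last dim.

    Mirrors the idea in the patches above (one scale per channel of the
    final dimension), not the actual llm-compressor Observer interface.
    """
    # Reduce over every dimension except the last one.
    reduce_dims = tuple(range(observed.ndim - 1))
    abs_max = observed.abs().amax(dim=reduce_dims)  # shape: [last_dim]

    # Symmetric int8-style range; an fp8 variant would use the format's max
    # representable value (e.g. 448.0 for float8_e4m3fn) instead of 2**(n-1)-1.
    q_max = 2 ** (num_bits - 1) - 1
    scale = (abs_max / q_max).clamp(min=torch.finfo(observed.dtype).eps)

    # Symmetric quantization -> zero point is always 0.
    zero_point = torch.zeros_like(scale, dtype=torch.int64)
    return scale, zero_point


# Example: fake KV states shaped [batch, num_key_value_heads, seq_len, head_dim],
# matching the shape noted in the cache.py comments above.
key_states = torch.randn(1, 8, 2048, 128)
scale, zp = per_channel_qparams(key_states)
print(scale.shape)  # torch.Size([128]) -- one scale per head_dim channel
```

Reducing over all leading dimensions is what lets the `CHANNEL` branch in `get_qparams` serve both the `"output"` and `"kv_cache"` cases with a single code path, which appears to be why patch 6 routes the KV-cache observer call through the same last-dimension logic as the attention-output case introduced in patch 3.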