
Commit 66681db

Fix WoQ INT4 error (#3563)
* Fix WoQ INT4 error
* correct attri name
* fix typo
* remove outdated unset lines in LLM inference README.md
* Update logic to accommodate all precision singleRank & TP cases
* correct flake8 error
1 parent 1e9c75b commit 66681db

File tree

4 files changed (+29, -19 lines changed)

- examples/cpu/llm/inference/README.md
- examples/cpu/llm/inference/run.py
- intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py
- intel_extension_for_pytorch/transformers/optimize.py


examples/cpu/llm/inference/README.md

+2-8
@@ -224,7 +224,6 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Met
 # Assuming the pre-sharded Llama model is generated at "saved_results/llama_local_shard/" folder.
 # run_accuracy_with_deepspeed.py script is under "distributed" directory.
 cd distributed
-unset KMP_AFFINITY
 
 # Distributed inference in FP32
 deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model "../saved_results/llama_local_shard/" --dtype float32 --ipex --tasks lambada_openai
@@ -431,11 +430,7 @@ If your INT4 checkpoints are not from HuggingFace or INC, please make sure the d
 
 ### 2.2.2 Run generation in distributed way
 
-#### 2.2.2.1 Prepare:
-
-```bash
-unset KMP_AFFINITY
-```
+#### 2.2.2.1 Prologue:
 
 In the DeepSpeed cases below, we recommend "--shard-model" to shard model weight sizes more even for better memory usage when running with DeepSpeed.
 
@@ -660,7 +655,7 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Met
 
 ### 3.2.2 Run in distributed way
 
-#### 3.2.2.1 Prepare:
+#### 3.2.2.1 Prologue:
 
 We provided a `run_accuracy_with_deepspeed.py` script for testing accuracy
 for the models benchmarked in distributed way via `deepspeed`.
@@ -676,7 +671,6 @@ the path of the folder of the sharded model instead of original model ID.
 ```bash
 # Run distributed accuracy with 2 ranks of one node
 cd ./distributed
-unset KMP_AFFINITY
 ```
 
 #### 3.2.2.2 FP32:

examples/cpu/llm/inference/run.py

+3-3
@@ -383,7 +383,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if args.audio is not None:
         infer_cmd.extend(["--audio", str(args.audio)])
 
-    print("LLM RUNTIME INFO: running model geneartion...")
+    print("LLM RUNTIME INFO: running model generation...")
     result = subprocess.run(infer_cmd)
     if result.returncode != 0:
         print("LLM RUNTIME ERROR: Running generation task failed. Quit.")
@@ -564,7 +564,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if args.config_file is not None:
         infer_cmd.extend(["--config-file", str(args.config_file)])
 
-    print("LLM RUNTIME INFO: running model geneartion...")
+    print("LLM RUNTIME INFO: running model generation...")
     result = subprocess.run(infer_cmd)
     if result.returncode != 0:
         print("LLM RUNTIME ERROR: Running generation task failed. Quit.")
@@ -700,7 +700,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if args.cache_weight_for_large_batch:
         infer_cmd.extend(["--cache-weight-for-large-batch"])
 
-    print("LLM RUNTIME INFO: running model geneartion with deepspeed (autotp)...")
+    print("LLM RUNTIME INFO: running model generation with deepspeed (autotp)...")
     result = subprocess.run(infer_cmd)
     if result.returncode != 0:
         print("LLM RUNTIME ERROR: Running generation task failed. Quit.")

intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py

+16-4
@@ -2667,15 +2667,27 @@ def __init__(self, module, config, sdp_module_ref, distributed=False):
         elif hasattr(module, "all_head_size"):
             self.hidden_size = module.all_head_size
         elif hasattr(module, "q_proj"):
-            if hasattr(module.q_proj, "weight"):
+            if hasattr(module.q_proj, "out_features"):
+                self.hidden_size = module.q_proj.out_features
+            elif hasattr(module.q_proj, "linear") and hasattr(
+                module.q_proj.linear, "out_features"
+            ):
+                self.hidden_size = module.q_proj.linear.out_features
+            elif hasattr(module.q_proj, "weight"):
                 self.hidden_size = module.q_proj.weight.shape[0]
             else:
                 self.hidden_size = module.q_proj.linear.weight.shape[0]
         elif hasattr(module, "o_proj"):
-            if hasattr(module.o_proj, "weight"):
-                self.hidden_size = module.o_proj.weight.shape[0]
+            if hasattr(module.o_proj, "in_features"):
+                self.hidden_size = module.q_proj.in_features
+            elif hasattr(module.o_proj, "linear") and hasattr(
+                module.o_proj.linear, "in_features"
+            ):
+                self.hidden_size = module.q_proj.linear.in_features
+            elif hasattr(module.o_proj, "weight"):
+                self.hidden_size = module.o_proj.weight.shape[1]
             else:
-                self.hidden_size = module.o_proj.linear.weight.shape[0]
+                self.hidden_size = module.o_proj.linear.weight.shape[1]
 
         # common known as num of attention_heads
         if hasattr(module, "num_attention_heads"):
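The reason for the longer fallback chain: with weight-only INT4 quantization (and under tensor-parallel sharding), q_proj/o_proj may be a quantized or wrapped linear module whose packed weight tensor no longer has the (out_features, in_features) shape of a dense nn.Linear, so hidden_size is better read from out_features/in_features when those attributes exist. Below is a minimal sketch of the same attribute-probing idea; infer_hidden_size and PackedInt4Linear are hypothetical names for illustration, not IPEX classes.

```python
import torch
import torch.nn as nn


def infer_hidden_size(proj: nn.Module) -> int:
    """Probe a (possibly quantized or wrapped) projection for its output width.

    Mirrors the fallback order this commit uses for q_proj: prefer an explicit
    ``out_features`` attribute, then a wrapped ``.linear`` submodule, and only
    then fall back to the dense weight shape.
    """
    if hasattr(proj, "out_features"):
        return proj.out_features
    if hasattr(proj, "linear") and hasattr(proj.linear, "out_features"):
        return proj.linear.out_features
    if hasattr(proj, "weight"):
        # a plain nn.Linear stores weight as (out_features, in_features)
        return proj.weight.shape[0]
    return proj.linear.weight.shape[0]


class PackedInt4Linear(nn.Module):
    """Toy weight-only-quantized linear whose packed weight is NOT (out, in)."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # two 4-bit values packed per int8 byte, so the stored shape is halved
        self.qweight = torch.zeros(out_features, in_features // 2, dtype=torch.int8)


print(infer_hidden_size(nn.Linear(4096, 4096)))         # 4096, via .out_features
print(infer_hidden_size(PackedInt4Linear(4096, 4096)))  # 4096, without touching qweight
```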

intel_extension_for_pytorch/transformers/optimize.py

+8-4
@@ -2400,11 +2400,15 @@ def optimize(
 
         elif use_low_precision_checkpoint:  # weight only quantization
             # for non-quantized layers
-            from ..nn.utils._weight_prepack import (
-                weight_prepack_with_ipex,
-            )
+            from ..frontend import optimize as ipex_optimize
 
-            _model = weight_prepack_with_ipex(_model, None, {})[0]
+            _model = _model.to(dtype)
+            _model = ipex_optimize(
+                _model,
+                dtype=dtype,
+                inplace=True,
+                auto_kernel_selection=(dtype == torch.float),
+            )
         else:
             # Note that low precision checkpoint is already handled at the beginning.
             # If checkpoint is not provided, model is quantized here
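In effect, the non-quantized layers of a weight-only-quantized model are now routed through IPEX's general frontend optimizer (after casting to the target dtype) instead of the low-level weight-prepack helper. A hedged sketch of what that call looks like from user code, assuming intel_extension_for_pytorch is installed and using a toy module in place of the real model:

```python
import torch
from torch import nn
import intel_extension_for_pytorch as ipex

# Toy stand-in for the non-quantized part of an LLM; sizes are illustrative.
model = nn.Sequential(
    nn.Linear(4096, 11008),
    nn.GELU(),
    nn.Linear(11008, 4096),
).eval()

dtype = torch.bfloat16
model = model.to(dtype)
model = ipex.optimize(
    model,
    dtype=dtype,
    inplace=True,
    # the commit enables auto kernel selection only for FP32
    auto_kernel_selection=(dtype == torch.float),
)

with torch.no_grad():
    out = model(torch.randn(1, 4096).to(dtype))
print(out.shape)  # torch.Size([1, 4096])
```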
