
Commit 66681db

Fix WoQ INT4 error (#3563)
* Fix WoQ INT4 error
* correct attri name
* fix typo
* remove outdated unset lines in LLM inference README.md
* Update logic to accommodate all precision singleRank & TP cases
* correct flake8 error
1 parent 1e9c75b commit 66681db

File tree

4 files changed (+29, -19 lines changed)

- examples/cpu/llm/inference/README.md
- examples/cpu/llm/inference/run.py
- intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py
- intel_extension_for_pytorch/transformers/optimize.py


examples/cpu/llm/inference/README.md

+2-8
@@ -224,7 +224,6 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Met
 # Assuming the pre-sharded Llama model is generated at "saved_results/llama_local_shard/" folder.
 # run_accuracy_with_deepspeed.py script is under "distributed" directory.
 cd distributed
-unset KMP_AFFINITY
 
 # Distributed inference in FP32
 deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model "../saved_results/llama_local_shard/" --dtype float32 --ipex --tasks lambada_openai
@@ -431,11 +430,7 @@ If your INT4 checkpoints are not from HuggingFace or INC, please make sure the d
 
 ### 2.2.2 Run generation in distributed way
 
-#### 2.2.2.1 Prepare:
-
-```bash
-unset KMP_AFFINITY
-```
+#### 2.2.2.1 Prologue:
 
 In the DeepSpeed cases below, we recommend "--shard-model" to shard model weight sizes more even for better memory usage when running with DeepSpeed.
 
@@ -660,7 +655,7 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Met
 
 ### 3.2.2 Run in distributed way
 
-#### 3.2.2.1 Prepare:
+#### 3.2.2.1 Prologue:
 
 We provided a `run_accuracy_with_deepspeed.py` script for testing accuracy
 for the models benchmarked in distributed way via `deepspeed`.
@@ -676,7 +671,6 @@ the path of the folder of the sharded model instead of original model ID.
 ```bash
 # Run distributed accuracy with 2 ranks of one node
 cd ./distributed
-unset KMP_AFFINITY
 ```
 
 #### 3.2.2.2 FP32:

examples/cpu/llm/inference/run.py

+3-3
@@ -383,7 +383,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if args.audio is not None:
         infer_cmd.extend(["--audio", str(args.audio)])
 
-    print("LLM RUNTIME INFO: running model geneartion...")
+    print("LLM RUNTIME INFO: running model generation...")
     result = subprocess.run(infer_cmd)
     if result.returncode != 0:
         print("LLM RUNTIME ERROR: Running generation task failed. Quit.")
@@ -564,7 +564,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if args.config_file is not None:
         infer_cmd.extend(["--config-file", str(args.config_file)])
 
-    print("LLM RUNTIME INFO: running model geneartion...")
+    print("LLM RUNTIME INFO: running model generation...")
     result = subprocess.run(infer_cmd)
     if result.returncode != 0:
         print("LLM RUNTIME ERROR: Running generation task failed. Quit.")
@@ -700,7 +700,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if args.cache_weight_for_large_batch:
         infer_cmd.extend(["--cache-weight-for-large-batch"])
 
-    print("LLM RUNTIME INFO: running model geneartion with deepspeed (autotp)...")
+    print("LLM RUNTIME INFO: running model generation with deepspeed (autotp)...")
     result = subprocess.run(infer_cmd)
     if result.returncode != 0:
         print("LLM RUNTIME ERROR: Running generation task failed. Quit.")

intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py

+16-4
@@ -2667,15 +2667,27 @@ def __init__(self, module, config, sdp_module_ref, distributed=False):
         elif hasattr(module, "all_head_size"):
             self.hidden_size = module.all_head_size
         elif hasattr(module, "q_proj"):
-            if hasattr(module.q_proj, "weight"):
+            if hasattr(module.q_proj, "out_features"):
+                self.hidden_size = module.q_proj.out_features
+            elif hasattr(module.q_proj, "linear") and hasattr(
+                module.q_proj.linear, "out_features"
+            ):
+                self.hidden_size = module.q_proj.linear.out_features
+            elif hasattr(module.q_proj, "weight"):
                 self.hidden_size = module.q_proj.weight.shape[0]
             else:
                 self.hidden_size = module.q_proj.linear.weight.shape[0]
         elif hasattr(module, "o_proj"):
-            if hasattr(module.o_proj, "weight"):
-                self.hidden_size = module.o_proj.weight.shape[0]
+            if hasattr(module.o_proj, "in_features"):
+                self.hidden_size = module.q_proj.in_features
+            elif hasattr(module.o_proj, "linear") and hasattr(
+                module.o_proj.linear, "in_features"
+            ):
+                self.hidden_size = module.q_proj.linear.in_features
+            elif hasattr(module.o_proj, "weight"):
+                self.hidden_size = module.o_proj.weight.shape[1]
             else:
-                self.hidden_size = module.o_proj.linear.weight.shape[0]
+                self.hidden_size = module.o_proj.linear.weight.shape[1]
 
         # common known as num of attention_heads
         if hasattr(module, "num_attention_heads"):
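The reason for the longer fallback chain: with weight-only INT4 quantization (and under tensor-parallel sharding), q_proj/o_proj may be a quantized or wrapped linear module whose packed weight tensor no longer has the (out_features, in_features) shape of a dense nn.Linear, so hidden_size is better read from out_features/in_features when those attributes exist. Below is a minimal sketch of the same attribute-probing idea; infer_hidden_size and PackedInt4Linear are hypothetical names for illustration, not IPEX classes.

```python
import torch
import torch.nn as nn


def infer_hidden_size(proj: nn.Module) -> int:
    """Probe a (possibly quantized or wrapped) projection for its output width.

    Mirrors the fallback order this commit uses for q_proj: prefer an explicit
    ``out_features`` attribute, then a wrapped ``.linear`` submodule, and only
    then fall back to the dense weight shape.
    """
    if hasattr(proj, "out_features"):
        return proj.out_features
    if hasattr(proj, "linear") and hasattr(proj.linear, "out_features"):
        return proj.linear.out_features
    if hasattr(proj, "weight"):
        # a plain nn.Linear stores weight as (out_features, in_features)
        return proj.weight.shape[0]
    return proj.linear.weight.shape[0]


class PackedInt4Linear(nn.Module):
    """Toy weight-only-quantized linear whose packed weight is NOT (out, in)."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # two 4-bit values packed per int8 byte, so the stored shape is halved
        self.qweight = torch.zeros(out_features, in_features // 2, dtype=torch.int8)


print(infer_hidden_size(nn.Linear(4096, 4096)))         # 4096, via .out_features
print(infer_hidden_size(PackedInt4Linear(4096, 4096)))  # 4096, without touching qweight
```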

intel_extension_for_pytorch/transformers/optimize.py

+8-4
@@ -2400,11 +2400,15 @@ def optimize(
 
         elif use_low_precision_checkpoint:  # weight only quantization
             # for non-quantized layers
-            from ..nn.utils._weight_prepack import (
-                weight_prepack_with_ipex,
-            )
+            from ..frontend import optimize as ipex_optimize
 
-            _model = weight_prepack_with_ipex(_model, None, {})[0]
+            _model = _model.to(dtype)
+            _model = ipex_optimize(
+                _model,
+                dtype=dtype,
+                inplace=True,
+                auto_kernel_selection=(dtype == torch.float),
+            )
         else:
             # Note that low precision checkpoint is already handled at the beginning.
             # If checkpoint is not provided, model is quantized here
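In effect, the non-quantized layers of a weight-only-quantized model are now routed through IPEX's general frontend optimizer (after casting to the target dtype) instead of the low-level weight-prepack helper. A hedged sketch of what that call looks like from user code, assuming intel_extension_for_pytorch is installed and using a toy module in place of the real model:

```python
import torch
from torch import nn
import intel_extension_for_pytorch as ipex

# Toy stand-in for the non-quantized part of an LLM; sizes are illustrative.
model = nn.Sequential(
    nn.Linear(4096, 11008),
    nn.GELU(),
    nn.Linear(11008, 4096),
).eval()

dtype = torch.bfloat16
model = model.to(dtype)
model = ipex.optimize(
    model,
    dtype=dtype,
    inplace=True,
    # the commit enables auto kernel selection only for FP32
    auto_kernel_selection=(dtype == torch.float),
)

with torch.no_grad():
    out = model(torch.randn(1, 4096).to(dtype))
print(out.shape)  # torch.Size([1, 4096])
```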
