
Commit f2e357a

Merge branch 'master' into xuehao/update_ci

Signed-off-by: Sun, Xuehao <[email protected]>
XuehaoSun committed Dec 26, 2024 · 2 parents 649dbb4 + adf0ca7
Showing 12 changed files with 34 additions and 18 deletions.
2 changes: 2 additions & 0 deletions .azure-pipelines/ut-3x-pt-fp8.yml
@@ -15,6 +15,8 @@ pr:
- neural_compressor/torch
- test/3x/torch/algorithms/fp8_quant
- test/3x/torch/quantization/fp8_quant
+- test/3x/torch/quantization/weight_only/test_rtn.py
+- test/3x/torch/quantization/weight_only/test_load.py
- setup.py
- requirements_pt.txt

2 changes: 1 addition & 1 deletion README.md
@@ -78,7 +78,7 @@ Following example code demonstrates FP8 Quantization, it is supported by Intel G
To try on Intel Gaudi2, docker image with Gaudi Software Stack is recommended, please refer to following script for environment setup. More details can be found in [Gaudi Guide](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#launch-docker-image-that-was-built).
```bash
# Run a container with an interactive shell
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu24.04/habanalabs/pytorch-installer-2.5.1:latest
```
Run the example:
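The Python example itself is collapsed in this view. For orientation, here is a minimal sketch of the 3.x FP8 flow the README demonstrates — `FP8Config`, `prepare`, and `convert` come from `neural_compressor.torch.quantization`; the toy model and calibration loop are assumptions, and the code only executes on a Gaudi (HPU) stack:
```python
import torch
from neural_compressor.torch.quantization import FP8Config, convert, prepare

# Toy stand-in for the user model (assumption, not the README's exact model).
model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU())

config = FP8Config(fp8_config="E4M3")  # FP8 format used on Gaudi
model = prepare(model, config)

# User-defined calibration over representative inputs collects the statistics.
for _ in range(4):
    model(torch.randn(2, 16))

model = convert(model)  # swaps in FP8 modules based on the calibration data
```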
(additional changed file — filename not captured in this view)
@@ -11,6 +11,5 @@ lm_eval==0.4.3
peft
numba
tbb
-# TODO: (Yi) SW-208079 replace auto-round with the released version
-auto-round-hpu @ git+https://github.com/intel/auto-round.git@hpu_only_pkg
-optimum-habana==1.14.1
+auto-round @ git+https://github.com/intel/[email protected]
+optimum-habana==1.14.1
(additional changed file, a shell benchmark script — filename not captured in this view)
@@ -14,6 +14,7 @@ function init_params {
batch_size=16
tuned_checkpoint=saved_results
task=lambada_openai
+incbench_cmd="incbench --num_cores_per_instance 4"
echo ${max_eval_samples}
for var in "$@"
do
@@ -104,6 +105,7 @@ function run_benchmark {
elif [ "${topology}" = "opt_125m_woq_autoround_int4_hpu" ]; then
model_name_or_path="facebook/opt-125m"
extra_cmd=$extra_cmd" --woq_algo AutoRound"
+incbench_cmd="incbench --num_instances 1"
elif [ "${topology}" = "opt_125m_woq_autotune_int4" ]; then
model_name_or_path="facebook/opt-125m"
fi
@@ -116,7 +118,7 @@
--batch_size ${batch_size} \
${extra_cmd} ${mode_cmd}
elif [[ ${mode} == "performance" ]]; then
-incbench --num_cores_per_instance 4 run_clm_no_trainer.py \
+${incbench_cmd} run_clm_no_trainer.py \
--model ${model_name_or_path} \
--batch_size ${batch_size} \
--output_dir ${tuned_checkpoint} \
(additional changed file — filename not captured; the hunks below appear to be from the run_clm_no_trainer.py example that the script above invokes)
@@ -270,8 +270,9 @@ def get_user_model():
torchscript = True
if args.woq_algo == "AutoRound" and is_habana_framework_installed():
print("Quantizing model with AutoRound on HPU")
-check_torch_compile_with_hpu_backend()
-set_envs_for_torch_compile_with_hpu_backend()
+if args.quantize:
+    check_torch_compile_with_hpu_backend()
+    set_envs_for_torch_compile_with_hpu_backend()
user_model = AutoModelForCausalLM.from_pretrained(
args.model,
trust_remote_code=args.trust_remote_code,
@@ -403,11 +404,12 @@ def calib_func(prepared_model):
max_seq_length=args.gptq_max_seq_length,
)
dataloader_for_calibration = dataloaderPreprocessor.get_prepared_dataloader()
-from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device
+from neural_compressor.torch.utils import get_model_device, move_input_device
from tqdm import tqdm
def run_fn_for_gptq(model, dataloader_for_calibration, *args):
for batch in tqdm(dataloader_for_calibration):
-batch = move_input_to_device(batch, device=None)
+device = get_model_device(model)
+batch = move_input_device(batch, device=device)
if isinstance(batch, tuple) or isinstance(batch, list):
model(batch[0])
elif isinstance(batch, dict):
@@ -525,11 +527,12 @@ def run_fn_for_autoround(model, dataloader):
)
dataloader = dataloaderPreprocessor.get_prepared_dataloader()
custom_tune_config = TuningConfig(config_set=get_woq_tuning_config())
-from neural_compressor.torch.algorithms.weight_only.utility import move_input_to_device
+from neural_compressor.torch.utils import get_model_device, move_input_device
from tqdm import tqdm
def run_fn_for_gptq(model, dataloader_for_calibration, *args):
for batch in tqdm(dataloader_for_calibration):
-batch = move_input_to_device(batch, device=None)
+device = get_model_device(model)
+batch = move_input_device(batch, device=device)
if isinstance(batch, tuple) or isinstance(batch, list):
model(batch[0])
elif isinstance(batch, dict):
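Both GPTQ calibration hunks above make the same fix: rather than `move_input_to_device(batch, device=None)`, each batch is moved to the device the model actually lives on. Reassembled as one sketch (the `dict` and fallback branches past the truncation point are assumptions):
```python
from tqdm import tqdm

from neural_compressor.torch.utils import get_model_device, move_input_device

def run_fn_for_gptq(model, dataloader_for_calibration, *args):
    for batch in tqdm(dataloader_for_calibration):
        device = get_model_device(model)  # e.g. "cpu", "cuda:0", or "hpu"
        batch = move_input_device(batch, device=device)
        if isinstance(batch, (tuple, list)):
            model(batch[0])
        elif isinstance(batch, dict):
            model(**batch)  # assumption: dict batches are keyword inputs
        else:
            model(batch)    # assumption: anything else is a plain tensor
```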
@@ -568,7 +571,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):


if is_hpex_available():
-from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+from habana_frameworks.torch.hpu.graphs import wrap_in_hpu_graph
user_model = user_model.to(torch.bfloat16)
wrap_in_hpu_graph(user_model, max_graphs=10)

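The import now names the `habana_frameworks.torch.hpu.graphs` submodule explicitly instead of relying on a re-export. A minimal sketch of the surrounding usage, assuming a Gaudi stack with `habana_frameworks` installed (the toy module is an assumption):
```python
import torch
from habana_frameworks.torch.hpu.graphs import wrap_in_hpu_graph

model = torch.nn.Linear(16, 16).to("hpu").to(torch.bfloat16)
# Record/replay HPU graphs around forward, caching up to 10 graph variants,
# mirroring the wrap_in_hpu_graph(user_model, max_graphs=10) call above.
wrap_in_hpu_graph(model, max_graphs=10)
out = model(torch.randn(2, 16, dtype=torch.bfloat16, device="hpu"))
```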
2 changes: 1 addition & 1 deletion neural_compressor/common/__init__.py
@@ -27,7 +27,7 @@
dump_elapsed_time,
)
from neural_compressor.common.base_config import options
-
+from neural_compressor.common.version import __version__

__all__ = [
"options",
1 change: 1 addition & 0 deletions neural_compressor/common/version.py
2 changes: 2 additions & 0 deletions neural_compressor/evaluation/lm_eval/models/huggingface.py
@@ -969,6 +969,8 @@ def _model_call(self, inps, attn_mask=None, labels=None):
output = output.logits
if self.pad_to_buckets and padding_length != 0: # use buckets to pad inputs
output = output[:, :-padding_length, :]
if "hpu" in output.device.type: # make sure return fp32 tensor for HPU, TODO: root cause
output = output.to(torch.float32)
return output

def _model_generate(self, context, max_length, stop, **generation_kwargs):
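The added guard casts HPU logits to fp32 before lm-eval's postprocessing sees them. As a rough illustration of why the dtype matters downstream (synthetic numbers, not from the commit), bf16 log-probabilities drift noticeably from their fp32 counterparts:
```python
import torch

logits = torch.tensor([10.0, -10.0, 0.5])
lp_fp32 = torch.log_softmax(logits.float(), dim=-1)
lp_bf16 = torch.log_softmax(logits.bfloat16(), dim=-1).float()
print((lp_fp32 - lp_bf16).abs().max())  # bf16 keeps only ~3 significant digits
```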
5 changes: 3 additions & 2 deletions neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -576,7 +576,7 @@ def _get_loaded_state_dict(self, config):
"_raise_exceptions_for_missing_entries": False,
"_commit_hash": commit_hash,
}
-resolved_archive_file = self._get_resolved_archive_file(**kwargs)
+resolved_archive_file, is_sharded = self._get_resolved_archive_file(**kwargs)

self._model_local_dir = os.path.abspath(os.path.expanduser(os.path.dirname(resolved_archive_file)))
# if hpu format tensor can be used directly, then update resolved_archive_file to the hpu format tensor file
@@ -640,6 +640,7 @@ def _get_resolved_archive_file(self, **kwargs):
subfolder = kwargs.get("subfolder")

resolved_archive_file = None
+is_sharded = False
is_local = os.path.isdir(self.model_name_or_path)
if is_local: # pragma: no cover
# self.model_name_or_path is a local directory
@@ -787,7 +788,7 @@
if is_local:
resolved_archive_file = archive_file

-return resolved_archive_file
+return resolved_archive_file, is_sharded

def _init_hf_model(self, model_class, config):
from accelerate.big_modeling import init_empty_weights
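With this change `_get_resolved_archive_file` returns a pair, which the `_get_loaded_state_dict` hunk above now unpacks. A self-contained toy version of the new contract (file names follow the usual Hugging Face layout; the function is an illustration, not the library's code):
```python
import os
from typing import List, Tuple, Union

def resolve_archive_file(model_dir: str) -> Tuple[Union[str, List[str]], bool]:
    """Return the checkpoint path(s) plus an is_sharded flag."""
    index = os.path.join(model_dir, "model.safetensors.index.json")
    single = os.path.join(model_dir, "model.safetensors")
    if os.path.exists(index):
        return index, True   # sharded checkpoint: caller follows the index
    return single, False     # single-file checkpoint

resolved_archive_file, is_sharded = resolve_archive_file("./saved_results")
```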
(additional changed file — an fp8_quant layer unit test; filename not captured in this view)
@@ -1,7 +1,7 @@
"""Use this module as an example of how to write new unit tests for layers."""

import os
-
+import sys
import pytest
import torch

@@ -58,7 +58,12 @@ def run_predefined_config():
run_with_raised_exception(run_predefined_config, FileNotFoundError, "Failed to load file ")
# TODO [SW-196641]: fix the following issue:
elif quant_mode == QuantMode.SHAPE:
-run_with_raised_exception(run_predefined_config, UnboundLocalError, "local variable 'fname_base' referenced before assignment")
+error_message = (
+    "cannot access local variable 'fname_base' where it is not associated with a value"
+    if sys.version_info >= (3, 11)
+    else "local variable 'fname_base' referenced before assignment"
+)
+run_with_raised_exception(run_predefined_config, UnboundLocalError, error_message)
else:
run_predefined_config()

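Context for the version switch: CPython 3.11 reworded the `UnboundLocalError` message, so a test that matches on the text has to branch. A quick standalone check (the variable name here is illustrative):
```python
import sys

def f():
    if False:
        fname_base = "never set"
    return fname_base  # fname_base is local but never bound

try:
    f()
except UnboundLocalError as exc:
    print(sys.version_info[:2], exc)
    # 3.11+ : cannot access local variable 'fname_base' where it is not associated with a value
    # <3.11 : local variable 'fname_base' referenced before assignment
```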
1 change: 1 addition & 0 deletions test/3x/torch/quantization/weight_only/test_rtn.py
@@ -16,6 +16,7 @@
from neural_compressor.torch.utils import accelerator, is_hpex_available

device = accelerator.current_device_name()
+torch.set_grad_enabled(False)


class ModelConv1d(torch.nn.Module):
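Disabling autograd at module level is a cheap win for inference-only quantization tests: none of the forward passes record a graph. A standalone illustration (toy tensors, not from the test):
```python
import torch

torch.set_grad_enabled(False)   # module-level switch, as in test_rtn.py
x = torch.randn(2, 4)
w = torch.randn(4, 3, requires_grad=True)
y = x @ w
print(y.requires_grad)          # False: no autograd graph was built
```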
2 changes: 1 addition & 1 deletion test/3x/torch/requirements.txt
@@ -1,5 +1,5 @@
auto_round
-deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.18.0
+deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0
expecttest
intel_extension_for_pytorch
numpy
