Add INC WoQ and remove ITREX dependency (huggingface#880)
* add inc woq and remove itrex dependency

Signed-off-by: changwangss <[email protected]>

* Update optimum/intel/neural_compressor/modeling_base.py

Co-authored-by: Ella Charlaix <[email protected]>

* Update optimum/intel/neural_compressor/modeling_base.py

Co-authored-by: Ella Charlaix <[email protected]>

* Update optimum/intel/neural_compressor/modeling_base.py

Co-authored-by: Ella Charlaix <[email protected]>

* Update optimum/intel/neural_compressor/modeling_base.py

Co-authored-by: Ella Charlaix <[email protected]>

* fix code according to review comments

Signed-off-by: changwangss <[email protected]>

* add logger setting

Signed-off-by: changwangss <[email protected]>

* improve ut

Signed-off-by: changwangss <[email protected]>

* move woq quantization to quantization.py

Signed-off-by: changwangss <[email protected]>

* Update examples/neural_compressor/language-modeling/run_clm.py

Co-authored-by: Ilyas Moutawwakil <[email protected]>

* Update examples/neural_compressor/language-modeling/run_clm.py

Co-authored-by: Ilyas Moutawwakil <[email protected]>

* remove dependency

Signed-off-by: changwangss <[email protected]>

* Update examples/neural_compressor/language-modeling/run_clm.py

* add woq saving and loading ut and logger info

Signed-off-by: changwangss <[email protected]>

* set transformers version limit

Signed-off-by: changwangss <[email protected]>

* fix installation of neural_compressor[pt]

Signed-off-by: changwangss <[email protected]>

* improve ut

Signed-off-by: changwangss <[email protected]>

* refactoring

* Refactor

* revert

* fix datasets loading issue

Signed-off-by: changwangss <[email protected]>

* fix

---------

Signed-off-by: changwangss <[email protected]>
Co-authored-by: Ella Charlaix <[email protected]>
Co-authored-by: Ilyas Moutawwakil <[email protected]>
Co-authored-by: Ella Charlaix <[email protected]>
4 people authored Sep 9, 2024
1 parent 4dc4d57 commit 8a015a6
Showing 11 changed files with 251 additions and 277 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/test_inc.yml
@@ -36,16 +36,15 @@ jobs:
pip install cmake
pip install py-cpuinfo
pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu
pip install intel-extension-for-pytorch==2.3.0
pip install datasets==2.19.0
pip install .[neural-compressor,diffusers,tests]
pip install intel-extension-for-transformers
pip install peft

- name: Test with Pytest
run: |
pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
- name: Test IPEX
run: |
pip uninstall -y intel-extension-for-transformers
pip install intel-extension-for-pytorch==2.3.0
pytest tests/neural_compressor/test_ipex.py

2 changes: 1 addition & 1 deletion examples/neural_compressor/language-modeling/README.md
@@ -97,4 +97,4 @@ respectively `dynamic`, `static`, `weight_only` or `aware_training`.

The flag `--verify_loading` can be passed along to verify that the resulting quantized model can be loaded correctly.

> **_Note:_** `weight_only` quantization_approach requires `neural-compressor` >= 2.3 and `intel-extension-for-transformers` >= 1.3.
> **_Note:_** `weight_only` quantization_approach requires `neural-compressor` > 3.0.
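For reference, a minimal sketch of what `--verify_loading` exercises, assuming the example script exported the quantized model and its tokenizer to a hypothetical `output_dir`: it simply reloads the checkpoint with `INCModelForCausalLM` and runs a forward pass.

```python
import torch
from transformers import AutoTokenizer

from optimum.intel.neural_compressor import INCModelForCausalLM

output_dir = "./clm_weight_only"  # hypothetical output directory of run_clm.py

# Reload the quantized checkpoint and check that a forward pass succeeds.
loaded_model = INCModelForCausalLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

tokens = tokenizer("This is a sample input", return_tensors="pt")
with torch.no_grad():
    outputs = loaded_model(**tokens)
print(outputs.logits.shape)
```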
examples/neural_compressor/language-modeling/requirements.txt
@@ -3,5 +3,4 @@ torch >= 1.9
datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
intel-extension-for-transformers >= 1.3
peft
40 changes: 24 additions & 16 deletions examples/neural_compressor/language-modeling/run_clm.py
@@ -39,6 +39,7 @@
QuantizationAwareTrainingConfig,
WeightPruningConfig,
)
from neural_compressor.transformers import GPTQConfig, RtnConfig
from transformers import (
CONFIG_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
@@ -57,12 +58,8 @@
from transformers.utils.versions import require_version

from optimum.intel.neural_compressor import INCModelForCausalLM, INCQuantizer, INCTrainer
from optimum.intel.utils.import_utils import ITREX_IMPORT_ERROR, is_itrex_available


if is_itrex_available():
from intel_extension_for_transformers.transformers.utils.config import GPTQConfig, RtnConfig

os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
@@ -203,12 +200,8 @@ class OptimizationArguments:
metadata={"help": "Whether or not to verify the loading of the quantized model."},
)
bits: str = field(
default="4",
metadata={"help": "Bits number of weight for weight only quantization. 1~8 bits."},
)
weight_dtype: str = field(
default="int4_clip",
metadata={"help": "weight dtype for weight only quantization."},
default=4,
metadata={"help": "Bits number of weight for weight only quantization. only support 4 bits now."},
)
group_size: int = field(
default=-1,
@@ -223,7 +216,6 @@ class OptimizationArguments:
metadata={"help": "Scheme for weight only quantization. Choose from 'sym' and 'asym'."},
)
quantization_methodology: str = field(
choices=["rtn", "gptq"],
default="rtn",
metadata={"help": "Quantization methodology for weight only quantization. Choose from 'rtn' and 'gptq'."},
)
@@ -249,6 +241,11 @@ class OptimizationArguments:
metadata={"help": "Calibration dataset sequence max length, this should align with your model config"},
)

def __post_init__(self):
woq_algorithms = ["rtn", "gptq"]
if self.quantization_methodology not in woq_algorithms:
raise ValueError(f"Value must be one of {woq_algorithms}, got {self.quantization_methodology}")


@dataclass
class DataTrainingArguments:
@@ -655,13 +652,11 @@ def compute_metrics(eval_preds):
else:
recipes = {}
if optim_args.quantization_approach == "weight_only":
if not is_itrex_available():
raise ImportError(ITREX_IMPORT_ERROR.format("WeightOnly quantization"))
if optim_args.apply_pruning or optim_args.apply_distillation:
raise ValueError("Weight only quantization and pruning or distillation cannot be combined.")

algorithm_args = {
"weight_dtype": optim_args.weight_dtype,
"bits": optim_args.bits,
"sym": optim_args.weight_only_scheme == "sym",
"group_size": optim_args.group_size,
}
@@ -756,10 +751,10 @@ def compute_metrics(eval_preds):
trainer.save_metrics("train", metrics)
trainer.save_state()

if optim_args.apply_quantization and optim_args.quantization_approach in {"static", "dynamic", "weight_only"}:
if optim_args.apply_quantization and optim_args.quantization_approach in {"static", "dynamic"}:
model = trainer.model if isinstance(trainer.model, PreTrainedModel) else trainer.model._model
quantizer = INCQuantizer.from_pretrained(model)
if optim_args.quantization_approach in ["static", "weight_only"]:
if optim_args.quantization_approach == "static":
num_calibration_samples = min(len(train_dataset), optim_args.num_calibration_samples)
train_dataset = train_dataset.select(range(num_calibration_samples))
quantization_config.calibration_sampling_size = num_calibration_samples
@@ -776,6 +771,19 @@ def compute_metrics(eval_preds):
)
trainer.model = quantizer._quantized_model

if optim_args.apply_quantization and optim_args.quantization_approach == "weight_only":
model = trainer.model if isinstance(trainer.model, PreTrainedModel) else trainer.model._model
num_calibration_samples = min(len(train_dataset), optim_args.num_calibration_samples)
train_dataset = train_dataset.select(range(num_calibration_samples))
quantization_config.calibration_sampling_size = num_calibration_samples
quantized_model = INCModelForCausalLM.from_pretrained(
model_args.model_name_or_path, quantization_config=quantization_config
)
if hasattr(quantization_config, "tokenizer"):
quantization_config.tokenizer.save_pretrained(training_args.output_dir)
quantized_model.save_pretrained(training_args.output_dir)
trainer.model = quantized_model

if optim_args.apply_quantization and optim_args.verify_loading:
loaded_model = INCModelForCausalLM.from_pretrained(training_args.output_dir)
tokens = tokenizer("This is a sample input", return_tensors="pt")
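The new `weight_only` branch above reduces to the following standalone sketch: build an INC quantization config (`RtnConfig` here; `GPTQConfig` would additionally need a calibration dataset and tokenizer) and let `INCModelForCausalLM.from_pretrained` quantize at load time. The model name, output directory, and the concrete `algorithm_args` values are illustrative assumptions.

```python
from neural_compressor.transformers import RtnConfig
from transformers import AutoTokenizer

from optimum.intel.neural_compressor import INCModelForCausalLM

model_name = "EleutherAI/gpt-neo-125m"  # assumed model id
output_dir = "./clm_weight_only"        # hypothetical output directory

# Same keyword names as algorithm_args in run_clm.py; values are illustrative.
algorithm_args = {"bits": 4, "sym": True, "group_size": 128}
quantization_config = RtnConfig(**algorithm_args)

# Round-to-nearest weight-only quantization is applied while loading the model.
quantized_model = INCModelForCausalLM.from_pretrained(
    model_name, quantization_config=quantization_config
)
quantized_model.save_pretrained(output_dir)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_dir)
```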
130 changes: 82 additions & 48 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -23,6 +23,8 @@
from huggingface_hub import hf_hub_download
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from huggingface_hub.utils import EntryNotFoundError
from neural_compressor.transformers import GPTQConfig, RtnConfig
from neural_compressor.transformers.models.modeling_auto import _BaseINCAutoModelClass
from neural_compressor.utils.pytorch import load
from transformers import (
AutoConfig,
@@ -47,8 +49,9 @@
from optimum.intel.generation import BaseModelForCausalLM

from ...modeling_base import OptimizedModel
from ..utils.import_utils import _torch_version, is_itrex_available, is_torch_version
from ..utils.import_utils import _torch_version, is_torch_version
from .configuration import INCConfig
from .quantization import _weight_only_quantization
from .utils import QUANTIZATION_CONFIG_NAME


Expand Down Expand Up @@ -122,8 +125,85 @@ def _from_pretrained(
raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
token = use_auth_token

quantization_config = kwargs.pop("quantization_config", None)
model_path = Path(model_id)
is_local = model_path.is_dir()

# ITREX compatibility
quantization_config_path = None
if is_local:
quantization_config_path = model_path / subfolder / QUANTIZATION_CONFIG_NAME
else:
try:
quantization_config_path = hf_hub_download(
repo_id=model_id,
filename=QUANTIZATION_CONFIG_NAME,
subfolder=subfolder,
token=token,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
)
except EntryNotFoundError:
pass
if quantization_config_path and Path(quantization_config_path).is_file():
algorithm = getattr(quantization_config, "quant_method", None)
if algorithm in {"rtn", "gptq", "awq", "autoround"}:
raise ValueError(
"This model was obtained through ITREX quantization, support for ITREX models is deprecated since neural-compressor v3.0. "
"To load this model please downgrade both optimum-intel and neural-compressor."
)
# quantization_config = PretrainedConfig.from_pretrained(quantization_config_path)
# config.quantization_config = quantization_config.to_dict()

if hasattr(config, "quantization_config"):
if config.quantization_config is None:
raise ValueError(
"The loading of `quantization_config` failed, to load this model please make sure the config is compatible"
)
else:
try:
logger.info(
"The weight only quantized model loading only supports the same format as GPTQ, such as https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/tree/main."
)
_BaseINCAutoModelClass.ORIG_MODEL = cls.auto_model_class
model = _BaseINCAutoModelClass.load_low_bit(
model_id,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
config=config,
**kwargs,
)
logger.info("Saved low bit model loading successfully. Other input args " "will be ignored.")
return model
except Exception as e:
raise RuntimeError(f"The quantized model cannot be loaded. Detailed error: {e}")
if isinstance(quantization_config, (RtnConfig, GPTQConfig)):
logger.info(
"The quantized model parameters will be saved in the same format as GPTQ, here is the sample model https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/tree/main for details."
)
model = _weight_only_quantization(
cls.auto_model_class,
model_id,
quantization_config=quantization_config,
token=token,
revision=revision,
force_download=force_download,
cache_dir=cache_dir,
local_files_only=local_files_only,
subfolder=subfolder,
trust_remote_code=trust_remote_code,
**kwargs,
)

return cls(model, config=config, model_save_dir=None, **kwargs).model

model_cache_path = None
inc_config = None
msg = None
@@ -165,52 +245,6 @@ def _from_pretrained(

model_save_dir = Path(model_cache_path).parent

if is_itrex_available():
quantization_config_path = None
if is_local:
quantization_config_path = model_path / subfolder / QUANTIZATION_CONFIG_NAME
else:
try:
quantization_config_path = hf_hub_download(
repo_id=model_id,
filename=QUANTIZATION_CONFIG_NAME,
subfolder=subfolder,
token=token,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
)
except EntryNotFoundError:
pass

if quantization_config_path and Path(quantization_config_path).is_file():
quantization_config = PretrainedConfig.from_pretrained(quantization_config_path)
algorithm = getattr(quantization_config, "quant_method", None)
if algorithm in {"rtn", "gptq", "awq", "autoround"}:
from intel_extension_for_transformers.transformers.modeling.modeling_auto import (
_BaseQBitsAutoModelClass,
)

_BaseQBitsAutoModelClass.ORIG_MODEL = cls.auto_model_class

model = _BaseQBitsAutoModelClass.from_pretrained(
pretrained_model_name_or_path=model_id,
token=token,
revision=revision,
force_download=force_download,
cache_dir=cache_dir,
local_files_only=local_files_only,
subfolder=subfolder,
trust_remote_code=trust_remote_code,
use_neural_speed=False,
**kwargs,
)

return cls(
model, config=config, model_save_dir=model_save_dir, q_config=quantization_config, **kwargs
)

try:
inc_config = INCConfig.from_pretrained(model_id, subfolder=subfolder, revision=revision)
if not is_torch_version("==", inc_config.torch_version):
@@ -254,7 +288,7 @@ def _from_pretrained(

def _save_pretrained(self, save_directory: Union[str, Path]):
if isinstance(self.model, torch.nn.Module):
# For ITREX model
# For INC weight only model
if isinstance(self._q_config, PretrainedConfig):
self._q_config.to_json_file(os.path.join(save_directory, QUANTIZATION_CONFIG_NAME))
self.model.save_pretrained(save_directory)
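The reworked `_from_pretrained` above takes one of two paths: an explicit `quantization_config` (`RtnConfig` or `GPTQConfig`) is routed to `_weight_only_quantization`, while a checkpoint whose config already carries a `quantization_config` entry is loaded through `_BaseINCAutoModelClass.load_low_bit`. Below is a hedged sketch of the second path, using the GPTQ-format repository cited in the log message; the generation arguments are illustrative, and whether a given checkpoint loads depends on its quantization method being one INC understands.

```python
from transformers import AutoTokenizer

from optimum.intel.neural_compressor import INCModelForCausalLM

# GPTQ-format checkpoint referenced by the logger message above.
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"

# config.json carries a quantization_config, so loading dispatches to load_low_bit.
model = INCModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("This is a sample input", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```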