diff --git a/README.md b/README.md index 3dbe557e5..97337f723 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,6 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) results = pipe("He's a dreadful magician and") - ``` For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). @@ -231,7 +230,7 @@ For more details, please refer to the [documentation](https://intel.github.io/in ## Running the examples -Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. +Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) and [`notebooks`](https://github.com/huggingface/optimum-intel/tree/main/notebooks) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. Do not forget to install requirements for every example: diff --git a/docker/Dockerfile.intel b/docker/Dockerfile.intel index 60fd51b42..a7f1dc978 100644 --- a/docker/Dockerfile.intel +++ b/docker/Dockerfile.intel @@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ libpng-dev \ python3 \ python3-pip \ + python3-dev \ + libnuma-dev \ && rm -rf /var/lib/apt/lists/*" RUN /usr/sbin/update-ccache-symlinks RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache @@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \ torchaudio==${TORCHAUDIO_VERSION} \ -f https://download.pytorch.org/whl/torch_stable.html && \ python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \ - python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ + python3 -m pip install --no-cache-dir numa -ARG OMP_NUM_THREADS=1 -ENV OMP_NUM_THREADS=${OMP_NUM_THREADS} ARG KMP_BLOCKTIME=1 ENV KMP_BLOCKTIME=${KMP_BLOCKTIME} ARG KMP_HW_SUBSET=1T ENV KMP_HW_SUBSET=${KMP_HW_SUBSET} -ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so" \ No newline at end of file +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so" diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7053a17ef..94ae09bb6 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -30,5 +30,16 @@ title: Tutorials isExpanded: false title: OpenVINO + - sections: + - local: ipex/inference + title: Inference + - local: ipex/models + title: Supported Models + - sections: + - local: ipex/tutorials/notebooks + title: Notebooks + title: Tutorials + isExpanded: false + title: IPEX title: Optimum Intel isExpanded: false diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 75e99d868..c9ad66206 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -19,6 +19,8 @@ limitations under the License. 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures. 
+[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) (IPEX) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion. + [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target. [OpenVINO](https://docs.openvino.ai) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/2024/about-openvino/compatibility-and-support/supported-devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index aaab1b1f8..cb3e9c758 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -22,6 +22,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi |:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------| | [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"`| | [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` | +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -42,4 +43,4 @@ or to install from source including dependencies: python -m pip install "optimum-intel[extras]"@git+https://github.com/huggingface/optimum-intel.git ``` -where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`. +where `extras` can be one or more of `neural-compressor`, `openvino`, `ipex`. diff --git a/docs/source/ipex/inference.mdx b/docs/source/ipex/inference.mdx new file mode 100644 index 000000000..c712275e4 --- /dev/null +++ b/docs/source/ipex/inference.mdx @@ -0,0 +1,45 @@ + + +# Inference + +Optimum Intel can be used to load models from the [Hub](https://huggingface.co/models) and create pipelines to run inference with IPEX optimizations (including patching with custom operators, weight prepacking and graph mode) on a variety of Intel processors. 
For now, support is only enabled for CPUs.
+
+
+## Loading
+
+You can load your model and apply IPEX optimizations (including weight prepacking and graph mode). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
+For now, support is only enabled for CPUs and the original model will be exported via TorchScript. In the future, `torch.compile` will be used and models exported via TorchScript will be deprecated.
+
+```diff
+  import torch
+  from transformers import AutoTokenizer, pipeline
+- from transformers import AutoModelForCausalLM
++ from optimum.intel import IPEXModelForCausalLM
+
+  model_id = "gpt2"
+- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
+  tokenizer = AutoTokenizer.from_pretrained(model_id)
+  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+  results = pipe("He's a dreadful magician and")
+```
+
+As shown in the table below, each task is associated with a class enabling your model to be loaded automatically.
+
+| Auto Class                           | Task                                 |
+|--------------------------------------|--------------------------------------|
+| `IPEXModelForSequenceClassification` | `text-classification`                |
+| `IPEXModelForTokenClassification`    | `token-classification`               |
+| `IPEXModelForQuestionAnswering`      | `question-answering`                 |
+| `IPEXModelForImageClassification`    | `image-classification`               |
+| `IPEXModel`                          | `feature-extraction`                 |
+| `IPEXModelForMaskedLM`               | `fill-mask`                          |
+| `IPEXModelForAudioClassification`    | `audio-classification`               |
+| `IPEXModelForCausalLM`               | `text-generation`                    |
diff --git a/docs/source/ipex/models.mdx b/docs/source/ipex/models.mdx
new file mode 100644
index 000000000..346ca2659
--- /dev/null
+++ b/docs/source/ipex/models.mdx
@@ -0,0 +1,46 @@
+
+
+# Supported models
+
+🤗 Optimum provides IPEX optimizations for both eager mode and graph mode. It provides classes and functions to perform these optimizations easily.
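For a quick sense of how these classes are used in practice, here is a minimal sketch (not part of this PR) that loads one of the supported architectures below through its IPEX auto class and runs it in a 🤗 pipeline. The checkpoint name is only an illustrative choice; `IPEXModelForSequenceClassification` and the `export=True` flag come from the loading example and task table above.

```python
# Minimal sketch: loading a supported architecture through an IPEX auto class.
# The checkpoint is an illustrative choice; any Hub model compatible with the
# text-classification task should follow the same pattern.
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(pipe("He's a dreadful magician."))
```

The same pattern applies to the other tasks listed above, swapping in the corresponding auto class.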
+Here is the list of the supported architectures : + +## [Transformers](https://huggingface.co/docs/transformers/index) + +- Albert +- Bart +- Beit +- Bert +- BlenderBot +- BlenderBotSmall +- Bloom +- CodeGen +- DistilBert +- Electra +- Flaubert +- GPT-2 +- GPT-BigCode +- GPT-Neo +- GPT-NeoX +- Llama +- MPT +- Mistral +- MobileNet v1 +- MobileNet v2 +- MobileVit +- OPT +- ResNet +- Roberta +- Roformer +- SqueezeBert +- UniSpeech +- Vit +- Wav2Vec2 +- XLM diff --git a/docs/source/ipex/tutorials/notebooks.mdx b/docs/source/ipex/tutorials/notebooks.mdx new file mode 100644 index 000000000..2093e4fca --- /dev/null +++ b/docs/source/ipex/tutorials/notebooks.mdx @@ -0,0 +1,16 @@ + + +# Notebooks + +## Inference + +| Notebook | Description | | | +|:---------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------- |:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|------:| +| [How to run inference with the IPEX](https://github.com/huggingface/optimum-intel/tree/main/notebooks/ipex) | Explains how to export your model to IPEX and to run inference with IPEX model on text-generation task | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/ipex/text_generation.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/ipex/text_generation.ipynb) | diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 3d8b44a56..742612ca3 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -190,6 +190,24 @@ def parse_args_openvino(parser: "ArgumentParser"): ) +def no_compression_parameter_provided(args): + return all( + ( + it is None + for it in ( + args.ratio, + args.group_size, + args.sym, + args.all_layers, + args.dataset, + args.num_samples, + args.awq, + args.sensitivity_metric, + ) + ) + ) + + class OVExportCommand(BaseOptimumCLICommand): COMMAND = CommandInfo(name="openvino", help="Export PyTorch models to OpenVINO IR.") @@ -230,23 +248,17 @@ def run(self): if self.args.weight_format is None: ov_config = None + if not no_compression_parameter_provided(self.args): + logger.warning( + "The provided compression parameters will not affect conversion because of the missing --weight-format argument." 
+ ) elif self.args.weight_format in {"fp16", "fp32"}: ov_config = OVConfig(dtype=self.args.weight_format) else: is_int8 = self.args.weight_format == "int8" - # For int4 quantization if not parameter is provided, then use the default config if exist - if ( - not is_int8 - and self.args.ratio is None - and self.args.group_size is None - and self.args.sym is None - and self.args.all_layers is None - and self.args.dataset is None - and self.args.num_samples is None - and self.args.awq is None - and self.args.sensitivity_metric is None - ): + # For int4 quantization if no parameter is provided, then use the default config if exist + if no_compression_parameter_provided(self.args) and not is_int8: quantization_config = get_default_int4_config(self.args.model) else: quantization_config = { @@ -305,7 +317,7 @@ def run(self): model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config) model.save_pretrained(self.args.output) if not self.args.disable_convert_tokenizer: - maybe_convert_tokenizers(library_name, self.args.output, model) + maybe_convert_tokenizers(library_name, self.args.output, model, task=task) elif task.startswith("text-generation") and quantize_with_dataset: from optimum.intel import OVModelForCausalLM @@ -324,7 +336,7 @@ def run(self): preprocessors = maybe_load_preprocessors( self.args.model, trust_remote_code=self.args.trust_remote_code ) - maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors) + maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task) else: # TODO : add input shapes main_export( diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index fb711d973..216c1c391 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -13,6 +13,8 @@ # limitations under the License. from transformers.models.bert.modeling_bert import BertIntermediate +from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconForCausalLM +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2LMHeadModel from transformers.models.llama.modeling_llama import ( LlamaDecoderLayer, LlamaForCausalLM, @@ -22,10 +24,14 @@ from transformers.models.vit.modeling_vit import ViTIntermediate from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version +from optimum.intel.utils.modeling_utils import replace_customized_linear_with_linear from .modeling_utils import ( _IPEX_MINIMUM_VERSION_FOR_PATCHING, + _gpt2_block_forward, _ipex_rms_layer_norm_forward, + _IPEXFalconDecoderLayer, + _IPEXGPT2Attention, _IPEXIntermediate, _IPEXLlamaDecoderLayer, _llama_model_forward, @@ -67,18 +73,56 @@ def patch_op(m, target_m, new_op_name, new_op): def _patch_llama_model(model): + """ + Patch llama model: + 1. Use IPEX Rope and IAKV cache + 2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add) + """ convert_functions(model, LlamaModel, "forward", _llama_model_forward) convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward) convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config) return model +def _patch_falcon_model(model): + """ + Patch falcon model: + 1. Disable SDPA so the attention mask will be compatible to ipex attention. + 2. Use IPEX Rope and IAKV cache + 3. 
Linear fusion with (Linear + Gelu) and (Linear + Add + Add) + """ + model.transformer._use_sdpa = False + replace_customized_linear_with_linear(model) + convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.config) + return model + + +def _patch_gpt2_model(model): + """ + Patch gpt2 model: + 1. Disable SDPA so the attention mask will be compatible to ipex attention. + 2. Use IAKV cache + """ + model.transformer._attn_implementation = "eager" + convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.config) + convert_functions(model, GPT2Block, "forward", _gpt2_block_forward) + return model + + def _patch_bert_model(model): + """ + Patch bert model: + 1. Linear fusion with Linear + Gelu + """ convert_class(model, BertIntermediate, _IPEXIntermediate) return model def _patch_vit_model(model): + """ + Patch vit model: + 1. Linear fusion with Linear + Gelu + """ convert_class(model, ViTIntermediate, _IPEXIntermediate) return model @@ -94,6 +138,10 @@ def _patch_model(model): ) if isinstance(model, LlamaForCausalLM): model = _patch_llama_model(model) + elif isinstance(model, FalconForCausalLM): + model = _patch_falcon_model(model) + elif isinstance(model, GPT2LMHeadModel): + model = _patch_gpt2_model(model) elif model.config.model_type == "bert": model = _patch_bert_model(model) elif model.config.model_type == "vit": diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 2c74a4232..3d28350b8 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -18,8 +18,10 @@ import torch from torch import nn +from torch.nn import functional as F from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.models.gpt2.modeling_gpt2 import GPT2Block from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv from optimum.intel.utils.import_utils import is_ipex_version @@ -40,6 +42,7 @@ IndirectAccessKVCacheAttention, Linear2SiluMul, LinearAdd, + LinearAddAdd, LinearGelu, RotaryEmbedding, ) @@ -153,62 +156,40 @@ def _llama_model_forward( ) -# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L321 -class _IPEXLlamaAttention(nn.Module): +def _gpt2_block_forward(self, hidden_states, *args, **kwargs): + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + bsz, seq_len, _ = hidden_states.size() + layer_past = kwargs.get("layer_past", None) + past_len = layer_past[0].size(-2) if layer_past is not None else 0 + attention_mask = (1 - attention_mask / torch.finfo(attention_mask.dtype).min).squeeze(1, 2) + attention_mask = _prepare_4d_causal_attention_mask(attention_mask, (bsz, seq_len), hidden_states, past_len) + kwargs["attention_mask"] = attention_mask + + return GPT2Block.forward(self, hidden_states, *args, **kwargs) + + +class _IPEXAttention(nn.Module): def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - - if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mha_linear_add = LinearAdd(module.o_proj) - del self.__dict__["_modules"]["o_proj"] - self.ipex_scale_dot_product = IndirectAccessKVCacheAttention( - text_max_length=module.config.max_position_embeddings - ) - self.ipex_rope = RotaryEmbedding( - module.config.max_position_embeddings, - module.config.hidden_size // 
module.config.num_attention_heads, - module.config.rope_theta, - module.config.architectures[0], - ) + self.ipex_scale_dot_product = IndirectAccessKVCacheAttention(text_max_length=config.max_position_embeddings) + if hasattr(config, "rope_theta"): + self.ipex_rope = RotaryEmbedding( + config.max_position_embeddings, + config.hidden_size // config.num_attention_heads, + config.rope_theta, + config.architectures[0], + ) def qkv_gemm(self, hidden_states): - bsz, seq_len, _ = hidden_states.size() - - query = self.q_proj(hidden_states) - key = self.k_proj(hidden_states) - value = self.v_proj(hidden_states) + raise NotImplementedError("Need to implement in specific model class") - query = query.view(bsz, seq_len, self.num_heads, self.head_dim) - key = key.view(bsz, seq_len, self.num_key_value_heads, self.head_dim) - value = value.view(bsz, seq_len, self.num_key_value_heads, self.head_dim) - - return query, key, value + def rope(self, *args, **kwargs): + raise NotImplementedError("Need to implement in specific model class") - def rope(self, query, key, kv_seq_len, position_ids, use_cache): - if use_cache: - key = self.ipex_rope( - key, - position_ids, - self.num_key_value_heads, - self.head_dim, - self.head_dim // 2, - self.head_dim, - kv_seq_len, - ) - query = self.ipex_rope( - query, - position_ids, - self.num_heads, - self.head_dim, - self.head_dim // 2, - self.head_dim, - kv_seq_len, - ) - return query, key - - def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, position_ids): + def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): # This ipex op pre-allocates buffers for past_key_values and use beam index history # which to decide which beam should be used to make attention scale dot more efficient. 
(attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( @@ -217,36 +198,21 @@ def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, pos value, math.sqrt(self.head_dim), past_key_value, - None, + kwargs.get("head_mask", None), attention_mask, + kwargs.get("alibi", None), ) return attn_output, past_key_value, attn_weights - # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L341 - def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, position_ids): - value_states = value.transpose(1, 2) - query_states = query.transpose(1, 2) - key_states = key.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - past_key_value = None - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): + raise NotImplementedError("Need to implement in specific model class") - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: - attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask) - attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) + def prepare_attention_mask_float(self, attention_mask, *args): + return attention_mask - return attn_output, past_key_value, attn_weights + def postprocess_attention_output(self, attn_output, bsz, seq_len): + attn_output = attn_output.transpose(1, 2).reshape(bsz, seq_len, self.hidden_size) + return attn_output def forward( self, @@ -256,53 +222,148 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - residual: Optional[torch.Tensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - Attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`. - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. 
Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. - residual (`torch.Tensor`): residual tensor to the layer of shape (batch, seq_len, embed_dim)` - """ + # For llama inputs: https://github.com/huggingface/transformers/blob/v4.43.4/src/transformers/models/llama/modeling_llama.py#L308 + # For falcon inputs: https://github.com/huggingface/transformers/blob/v4.43.4/src/transformers/models/falcon/modeling_falcon.py#L370 + if past_key_value is None and kwargs.get("layer_past", None) is not None: + past_key_value = kwargs.pop("layer_past", None) bsz, seq_len, _ = hidden_states.size() - kv_seq_len = seq_len + past_key_value[0].size(-2) if past_key_value is not None else seq_len + past_len = past_key_value[0].size(-2) if past_key_value is not None else 0 + kv_seq_len = seq_len + past_len - query, key, value = self.qkv_gemm(hidden_states) - query, key = self.rope(query, key, kv_seq_len, position_ids, use_cache) + qkv_out = self.qkv_gemm(hidden_states) + if isinstance(qkv_out, tuple) and len(qkv_out) == 3: + query, key, value = self.qkv_gemm(hidden_states) + query, key = self.rope(query, key, kv_seq_len, use_cache, position_ids=position_ids) + else: + query, key, value = self.rope(qkv_out, kv_seq_len, use_cache, past_len=past_len) + attention_mask = self.prepare_attention_mask_float(attention_mask, query.dtype) sdpa = self.sdpa_with_cache if use_cache else self.sdpa_without_cache attn_output, past_key_value, attn_weights = sdpa( - query, key, value, past_key_value, attention_mask, position_ids + query, + key, + value, + past_key_value, + attention_mask, + position_ids=position_ids, + head_mask=kwargs.get("head_mask", None), + alibi=kwargs.get("alibi", None), ) - attn_output = attn_output.transpose(1, 2).reshape(bsz, seq_len, self.hidden_size) - - if hasattr(self, "mha_linear_add"): - attn_output = self.mha_linear_add(attn_output, residual) - else: - attn_output = self.o_proj(attn_output) - attn_output = residual + attn_output + attn_output = self.postprocess_attention_output(attn_output, bsz, seq_len) if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + return attn_output, past_key_value, attn_weights + + +class _IPEXLlamaAttention(_IPEXAttention): + def __init__(self, module, config) -> None: + super().__init__(module, config) + if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mha_linear_add = LinearAdd(module.o_proj) + del self.__dict__["_modules"]["o_proj"] + + def qkv_gemm(self, hidden_states): + bsz, seq_len, _ = hidden_states.size() + query = self.q_proj(hidden_states).view(bsz, seq_len, self.num_heads, self.head_dim) + key = self.k_proj(hidden_states).view(bsz, seq_len, self.num_key_value_heads, self.head_dim) + value = self.v_proj(hidden_states).view(bsz, seq_len, self.num_key_value_heads, self.head_dim) + + return query, key, value + + def rope(self, query, key, kv_seq_len, use_cache, position_ids): + if use_cache: + args = (self.head_dim, self.head_dim // 2, self.head_dim, kv_seq_len) + key = self.ipex_rope(key, position_ids, self.num_key_value_heads, *args) + query = self.ipex_rope(query, position_ids, self.num_heads, *args) + return query, key + + # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L341 + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, position_ids, **kwargs): + query, key, value = 
query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2) + cos, sin = self.rotary_emb(value, position_ids) + query, key = apply_rotary_pos_emb(query, key, cos, sin) + # repeat k/v heads if n_kv_heads < n_heads + key = repeat_kv(key, self.num_key_value_groups) + value = repeat_kv(value, self.num_key_value_groups) + attn_weights = torch.matmul(query, key.transpose(2, 3)) / math.sqrt(self.head_dim) + if attention_mask is not None: + attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask) + attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_output = torch.matmul(attn_weights, value) + + return attn_output, None, attn_weights + + +class _IPEXFalconAttention(_IPEXAttention): + def qkv_gemm(self, hidden_states): + return self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + + def rope(self, fused_qkv, seq_len, use_cache, past_len): + if use_cache: + query, key, value = self.ipex_rope( + fused_qkv, + torch.tensor(past_len), + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + seq_len, + 3, + ) + else: + (query, key, value) = self._split_heads(fused_qkv) + return query, key, value + + def prepare_attention_mask_float(self, attention_mask, dtype): + attention_mask_float = ( + (attention_mask * 1.0).masked_fill(attention_mask.to(torch.bool), float("-1e9")).to(dtype) + ) + return attention_mask_float + + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): + bs, q_len = query.shape[0], query.shape[1] + query, key, value = query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2) + attn_output = F.scaled_dot_product_attention(query, key, value, attention_mask, 0.0, is_causal=False) + attn_output = attn_output.view(bs, self.num_heads, q_len, self.head_dim) + + return attn_output, None, None + + +class _IPEXGPT2Attention(_IPEXAttention): + def __init__(self, module, config) -> None: + super().__init__(module, config) + + def _split_heads_ipex(self, tensor, num_heads, attn_head_size): + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + return tensor.view(new_shape) # (batch, seq_length, head, head_features) + + def qkv_gemm(self, hidden_states): + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + query = self._split_heads_ipex(query, self.num_heads, self.head_dim) + key = self._split_heads_ipex(key, self.num_heads, self.head_dim) + value = self._split_heads_ipex(value, self.num_heads, self.head_dim) + return query, key, value + + def rope(self, query, key, *args, **kwargs): + return query, key + + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): + query, key, value = query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2) + attn_output = F.scaled_dot_product_attention(query, key, value, attention_mask, 0.0, is_causal=True) + + return attn_output, None, None + + def postprocess_attention_output(self, attn_output, bsz, seq_len): + attn_output = attn_output.transpose(1, 2).reshape(bsz, seq_len, self.embed_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + return attn_output # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L186 @@ -311,7 +372,6 @@ def __init__(self, module, config) -> None: super().__init__() 
_setattr_from_module(self, module) self.config = config - # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: self.mlp_linear_add = LinearAdd(module.down_proj) @@ -321,11 +381,6 @@ def __init__(self, module, config) -> None: del self.__dict__["_modules"]["up_proj"] def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, **kwargs): - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - residual (`torch.Tensor`): residual tensor to the layer of shape (batch, seq_len, embed_dim)` - """ if hasattr(self, "linear_silu_mul"): mlp_gate = self.linear_silu_mul(hidden_states) if hasattr(self, "mlp_linear_add"): @@ -340,69 +395,91 @@ def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, ** return hidden_states -# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 -class _IPEXLlamaDecoderLayer(nn.Module): - def __init__(self, module, config): +class _IPEXFalconMLP(nn.Module): + def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) - self.self_attn = _IPEXLlamaAttention(module.self_attn, config) - self.mlp = _IPEXLlamaMLP(module.mlp, config) + self.config = config + # LinearAllreduce and LinearLayer cannot use fused op LinearAdd + self.linear_gelu = LinearGelu(module.dense_h_to_4h) + del self.__dict__["_modules"]["dense_h_to_4h"] + if module.dense_4h_to_h.__class__.__name__ not in ["LinearAllreduce"]: + self.linear_add_add = LinearAddAdd(module.dense_4h_to_h) + del self.__dict__["_modules"]["dense_4h_to_h"] def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, + attention_output: torch.Tensor = None, + residual: torch.Tensor = None, **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - Attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`. - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
- """ + ): + mlp_hidden_states = self.linear_gelu(hidden_states) + if hasattr(self, "linear_add_add"): + output = self.linear_add_add(mlp_hidden_states, attention_output, residual) + else: + mlp_output = self.mlp.dense_4h_to_h(mlp_hidden_states) + output = mlp_output + attention_output + residual + + return output + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 +class _IPEXLlamaDecoderLayer(nn.Module): + def __init__(self, module, config): + super().__init__() + _setattr_from_module(self, module) + self.self_attn = _IPEXLlamaAttention(module.self_attn, config) + self.mlp = _IPEXLlamaMLP(module.mlp, config) + def forward(self, hidden_states: torch.Tensor, **kwargs): + # Please see the original model's forward to check the parameter residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=None, - residual=residual, - **kwargs, - ) + hidden_states, present, attn_weights = self.self_attn(hidden_states=hidden_states, **kwargs) + if hasattr(self.self_attn, "mha_linear_add"): + hidden_states = self.self_attn.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.o_proj(hidden_states) + hidden_states = residual + hidden_states # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states, residual, **kwargs) outputs = (hidden_states,) + if kwargs.get("output_attentions", False): + outputs += (attn_weights,) + if kwargs.get("use_cache", False): + outputs += (present,) - if output_attentions: - outputs += (self_attn_weights,) + return outputs - if use_cache: - outputs += (present_key_value,) + +class _IPEXFalconDecoderLayer(nn.Module): + def __init__(self, module, config): + super().__init__() + _setattr_from_module(self, module) + self.self_attention = _IPEXFalconAttention(module.self_attention, config) + self.mlp = _IPEXFalconMLP(module.mlp, config) + + def forward(self, hidden_states: torch.Tensor, **kwargs): + # Please see the original model's forward to check the parameter + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_output, present, attn_weights = self.self_attention(hidden_states, **kwargs) + attn_output = self.self_attention.dense(attn_output) + hidden_states = self.mlp(hidden_states, attn_output, residual) + + outputs = (hidden_states,) + if kwargs.get("output_attentions", False): + outputs += (attn_weights,) + if kwargs.get("use_cache", False): + outputs += (present,) return outputs diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index f2afd6535..77f804960 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -21,12 +21,17 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase +from transformers.utils import is_torch_available from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import 
SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model -from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version +from optimum.intel.utils.import_utils import ( + is_openvino_tokenizers_available, + is_openvino_version, + is_transformers_version, +) from optimum.utils.save_utils import maybe_load_preprocessors from .utils import clear_class_registry @@ -35,6 +40,11 @@ if TYPE_CHECKING: from optimum.intel.openvino.configuration import OVConfig + +if is_torch_available(): + import torch + + _COMPRESSION_OPTIONS = { "int8": {"bits": 8}, "int4_sym_g128": {"bits": 4, "sym": True, "group_size": 128}, @@ -100,6 +110,7 @@ def main_export( stateful: bool = True, convert_tokenizer: bool = False, library_name: Optional[str] = None, + model_loading_kwargs: Optional[Dict[str, Any]] = None, **kwargs_shapes, ): """ @@ -230,7 +241,8 @@ def main_export( do_gptq_patching = False custom_architecture = False - loading_kwargs = {} + patch_16bit = False + loading_kwargs = model_loading_kwargs or {} if library_name == "transformers": config = AutoConfig.from_pretrained( model_name_or_path, @@ -281,11 +293,30 @@ def main_export( "Please provide custom export config if you want load model with remote code." ) trust_remote_code = False + dtype = loading_kwargs.get("torch_dtype") + if isinstance(dtype, str): + dtype = config.torch_dtype if dtype == "auto" else getattr(torch, dtype) + if ( + dtype is None + and framework == "pt" + and not do_gptq_patching + and task.startswith("text-generation") + and getattr(config, "torch_dtype", torch.float32) in [torch.float16, torch.bfloat16] + ): + if ov_config is not None and ov_config.dtype in {"fp16", "fp32"}: + dtype = torch.float16 if ov_config.dtype == "fp16" else torch.float32 + elif is_openvino_version(">=", "2024.2") and config.torch_dtype == torch.float16: + dtype = torch.float16 + elif is_openvino_version(">=", "2024.3") and config.torch_dtype == torch.bfloat16: + dtype = torch.bfloat16 + + if dtype is not None: + if dtype in [torch.float16, torch.bfloat16]: + patch_16bit = True + loading_kwargs["torch_dtype"] = dtype # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: - import torch - torch.set_default_dtype(torch.float32) orig_cuda_check = torch.cuda.is_available torch.cuda.is_available = lambda: True @@ -383,11 +414,12 @@ class StoreAttr(object): preprocessors=preprocessors, device=device, trust_remote_code=trust_remote_code, + patch_16bit_model=patch_16bit, **kwargs_shapes, ) if convert_tokenizer: - maybe_convert_tokenizers(library_name, output, model, preprocessors) + maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task) clear_class_registry() del model @@ -399,7 +431,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = orig_post_init_model -def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None): +def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None): """ Tries to convert tokenizers to OV format and export them to disk. @@ -412,6 +444,8 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro Model instance. preprocessors (`Iterable`, *optional*, defaults to None): Iterable possibly containing tokenizers to be converted. + task (`str`, *optional*, defaults to None): + The task to export the model for. Affects tokenizer conversion parameters. 
""" from optimum.exporters.openvino.convert import export_tokenizer @@ -420,7 +454,7 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) if tokenizer: try: - export_tokenizer(tokenizer, output) + export_tokenizer(tokenizer, output, task=task) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer " @@ -430,6 +464,6 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro for tokenizer_name in ("tokenizer", "tokenizer_2"): tokenizer = getattr(model, tokenizer_name, None) if tokenizer: - export_tokenizer(tokenizer, output / tokenizer_name) + export_tokenizer(tokenizer, output / tokenizer_name, task=task) else: logger.warning("Tokenizer won't be converted.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 83c031435..0b937734c 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -40,6 +40,7 @@ _timm_version, _torch_version, _transformers_version, + compare_versions, ) from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors @@ -104,6 +105,7 @@ def export( model_kwargs: Optional[Dict[str, Any]] = None, ov_config: Optional["OVConfig"] = None, stateful: bool = True, + patch_16bit_model: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation. @@ -155,6 +157,7 @@ def export( ov_config=ov_config, model_kwargs=model_kwargs, stateful=stateful, + patch_16bit_model=patch_16bit_model, ) elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): @@ -288,6 +291,7 @@ def export_pytorch( model_kwargs: Optional[Dict[str, Any]] = None, ov_config: Optional["OVConfig"] = None, stateful: bool = False, + patch_16bit_model: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an OpenVINO Intermediate Representation. @@ -380,6 +384,10 @@ def ts_patched_forward(*args, **kwargs): patcher.patched_forward = ts_patched_forward with patcher: + if patch_16bit_model: + from openvino.frontend.pytorch.patch_model import __make_16bit_traceable + + __make_16bit_traceable(model) check_dummy_inputs_are_allowed(model, dummy_inputs) sig = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.call) inputs = config.ordered_inputs(model) @@ -400,6 +408,13 @@ def ts_patched_forward(*args, **kwargs): "A stateless model will be exported instead. It may result in sub-optimal inference performance." "Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path." 
) + + if patch_16bit_model: + from openvino.frontend.pytorch.patch_model import unpatch_model + + unpatch_model(model, "_openvino_module_extension_patch_orig_forward") + model.to(torch.float32) + return export_pytorch_via_onnx( model, config, @@ -466,6 +481,7 @@ def export_models( model_kwargs: Optional[Dict[str, Any]] = None, ov_config: Optional["OVConfig"] = None, stateful: bool = True, + patch_16bit_model: bool = False, ) -> Tuple[List[List[str]], List[List[str]]]: """ Export the models to OpenVINO IR format @@ -517,6 +533,7 @@ def export_models( model_kwargs=model_kwargs, ov_config=ov_config, stateful=stateful, + patch_16bit_model=patch_16bit_model, ) ) @@ -537,6 +554,7 @@ def export_from_model( preprocessors: List = None, device: str = "cpu", trust_remote_code: bool = False, + patch_16bit_model: bool = False, **kwargs_shapes, ): model_kwargs = model_kwargs or {} @@ -635,9 +653,9 @@ def export_from_model( if is_nncf_available(): from ...intel.openvino.configuration import OVConfig - ov_config = OVConfig(quantization_config={"bits": 8}) + ov_config = OVConfig(quantization_config={"bits": 8, "sym": False}) - logger.info("The model weights will be quantized to int8.") + logger.info("The model weights will be quantized to int8_asym.") else: logger.warning( "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf." @@ -699,6 +717,7 @@ def export_from_model( stateful=stateful, opset=opset, model_kwargs=model_kwargs, + patch_16bit_model=patch_16bit_model, ) @@ -706,6 +725,7 @@ def export_tokenizer( tokenizer, output: Union[str, Path], suffix: Optional[str] = "", + task: Optional[str] = None, ): # avoid circular imports from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME @@ -722,6 +742,15 @@ def export_tokenizer( if output.exists(): tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output) + if ( + task is not None + and task.startswith("text-generation") + and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0") + ): + logger.info(f"Set tokenizer padding side to left for `{task}` task.") + tokenizer.padding_side = "left" + tokenizer.truncation_side = "left" + try: converted = convert_tokenizer(tokenizer, with_detokenizer=True) except NotImplementedError: diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7e82db16b..0ad38927a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union from packaging import version +from transformers import PreTrainedModel, TFPreTrainedModel from transformers.utils import is_tf_available from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig @@ -23,6 +24,7 @@ CodeGenOnnxConfig, FalconOnnxConfig, GemmaOnnxConfig, + GPTNeoXOnnxConfig, LlamaOnnxConfig, MistralOnnxConfig, MPTOnnxConfig, @@ -31,6 +33,7 @@ VaeDecoderOnnxConfig, VaeEncoderOnnxConfig, ) +from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( @@ -50,6 +53,9 @@ ChatGLMModelPatcher, CodeGenModelPatcher, DBRXModelPatcher, + FalconModelPatcher, + GptNeoxJapaneseModelPatcher, + GptNeoxModelPatcher, InternLM2Patcher, InternLMModelPatcher, JaisModelPatcher, @@ -60,6 +66,7 @@ PersimmonModelPatcher, Phi3ModelPatcher, QwenModelPatcher, + 
RotaryEmbPatcher, UpdateCausalMaskModelPatcher, XverseModelPatcher, ) @@ -505,6 +512,12 @@ def patch_model_for_export( return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs) +def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None +) -> "ModelPatcher": + return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs) + + @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 @@ -632,6 +645,11 @@ class FalconOpenVINOConfig(FalconOnnxConfig): ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES DUMMY_PKV_GENERATOR_CLASS = OVFalconDummyPastKeyValuesGenerator + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return FalconModelPatcher(self, model, model_kwargs=model_kwargs) + @register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers") class UNetOpenVINOConfig(UNetOnnxConfig): @@ -725,6 +743,11 @@ class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return GptNeoxJapaneseModelPatcher(self, model, model_kwargs=model_kwargs) + @register_in_tasks_manager( "cohere", @@ -897,6 +920,44 @@ def patch_model_for_export( return ArcticModelPatcher(self, model, model_kwargs=model_kwargs) +class OVMistralDummyPastKeyValuesGenerator(MistralDummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + **kwargs, + ) + self.head_dim = getattr(normalized_config, "head_dim", self.hidden_size // self.num_attention_heads) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sequence_length, + self.head_dim, + ) + return [ + ( + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + @register_in_tasks_manager( "mistral", *[ @@ -909,7 +970,30 @@ def patch_model_for_export( library_name="transformers", ) class MistralOpenVINOConfig(MistralOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + OVMistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = OVMistralDummyPastKeyValuesGenerator + def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return MistralModelPatcher(self, model, model_kwargs=model_kwargs) + + 
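As an aside, the shape logic behind the new `OVMistralDummyPastKeyValuesGenerator` above reduces to the following standalone sketch (not the library class; the config values are made up for illustration). The point of the override is the `head_dim` fallback, so configs that define `head_dim` explicitly are honored while others fall back to `hidden_size // num_attention_heads`.

```python
# Standalone sketch of the dummy past-KV layout generated above: one (key, value)
# pair per decoder layer, sized with num_key_value_heads (grouped-query attention)
# and a head_dim that falls back to hidden_size // num_attention_heads.
import torch

class DummyConfig:  # made-up values, for illustration only
    hidden_size = 4096
    num_attention_heads = 32
    num_key_value_heads = 8
    num_hidden_layers = 2

def dummy_past_key_values(config, batch_size=2, sequence_length=16):
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    shape = (batch_size, config.num_key_value_heads, sequence_length, head_dim)
    return [(torch.rand(shape), torch.rand(shape)) for _ in range(config.num_hidden_layers)]

pkv = dummy_past_key_values(DummyConfig())
print(pkv[0][0].shape)  # torch.Size([2, 8, 16, 128])
```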
+@register_in_tasks_manager( + "gpt-neox", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class GPTNeoxOpenVINOConfig(GPTNeoXOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index acc85d17b..6e65f4f11 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -108,6 +108,15 @@ def patch_update_causal_mask(model, transformers_version): inner_model._update_causal_mask = types.MethodType(_llama_gemma_update_causal_mask, inner_model) +# initialization of sin/cos cached in bf16/fp16 leads to accuracy loss +# reinitialize them to save in float32 before export +def _reinitialize_cos_sin_cached_fp32(rotary_emb): + if rotary_emb.cos_cached.dtype != torch.float32: + rotary_emb._set_cos_sin_cache( + seq_len=rotary_emb.max_position_embeddings, device=rotary_emb.inv_freq.device, dtype=torch.float32 + ) + + def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ """ batch_size, sequence_length, hidden_dim = hidden_states.shape @@ -158,6 +167,7 @@ def __enter__(self): layer.block_sparse_moe.forward = types.MethodType( _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -689,6 +699,10 @@ def __enter__(self): self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask self._model.model._update_causal_mask = types.MethodType(_mistral_update_causal_mask, self._model.model) + else: + for layer in self._model.model.layers: + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2224,6 +2238,7 @@ def __enter__(self): orig_self_attn_fwd = layer.self_attn.forward layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn) layer.self_attn._orig_forward = orig_self_attn_fwd + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2359,8 +2374,39 @@ class UpdateCausalMaskModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() patch_update_causal_mask(self._model, "4.42.0") + if hasattr(self._model.model.layers[0].self_attn.rotary_emb, "_set_cos_sin_cache"): + for layer in self._model.model.layers: + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) if hasattr(self._model.model, "_orig_update_causal_mask"): self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + + +class RotaryEmbPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) + + +class FalconModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.transformer.h: + 
_reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb) + + +class GptNeoxModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.gpt_neox.layers: + _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) + + +class GptNeoxJapaneseModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.gpt_neox_japanese.layers: + _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index 8ca42b67a..d8132107a 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -200,10 +200,10 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model): """ key_value_input_names = [ - key.get_any_name() for key in ov_model.inputs if any("key_values" in key_name for key_name in key.get_names()) + key_name for key in ov_model.inputs for key_name in key.get_names() if "key_values" in key_name ] key_value_output_names = [ - key.get_any_name() for key in ov_model.outputs if any("present" in key_name for key_name in key.get_names()) + key_name for key in ov_model.outputs for key_name in key.get_names() if "present" in key_name ] not_kv_inputs = [ input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names()) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 618b4f4e3..30c29a2dc 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -65,7 +65,7 @@ logger = logging.getLogger(__name__) -_IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit") +_IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit", "falcon", "gpt2") _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation") @@ -213,7 +213,7 @@ def _from_pretrained( token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: str = HUGGINGFACE_HUB_CACHE, + cache_dir: Union[str, Path] = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, @@ -221,6 +221,40 @@ def _from_pretrained( file_name: Optional[str] = WEIGHTS_NAME, **kwargs, ): + """ + Loads a model and its configuration file from a directory or the HF Hub. + + Arguments: + model_id (`str` or `Path`): + The directory from which to load the model. + Can be either: + - The model id of a pretrained model hosted inside a model repo on huggingface.co. + - The path to a directory containing the model weights. + use_auth_token (Optional[Union[bool, str]], defaults to `None`): + Deprecated. Please use `token` instead. + token (Optional[Union[bool, str]], defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*): + The specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, Path]`, *optional*): + The path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. 
+ subfolder (`str`, *optional*) + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can specify the folder name here. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + torch_dtype (`Optional[Union[str, "torch.dtype"]]`, *optional*) + float16 or bfloat16 or float32: load in a specified dtype, ignoring the model config.torch_dtype if one exists. If not specified, the model will get loaded in float32. + trust_remote_code (`bool`, *optional*) + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository. + file_name (`str`, *optional*): + The file name of the model to load. Overwrites the default file name and allows one to load the model + with a different name. + """ if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", @@ -496,7 +530,14 @@ def __init__( elif "_reorder_cache" in self.model_cls.__dict__: self._reorder_cache = self.model_cls._reorder_cache.__get__(self) - if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon", "mistral"}: + if is_transformers_version(">=", "4.38.0") and model_type in { + "llama", + "phi", + "persimmon", + "mistral", + "falcon", + "gpt2", + }: self.prepare_inputs_for_generation = _ipex_prepare_inputs_for_generation else: self.prepare_inputs_for_generation = self.model_cls.prepare_inputs_for_generation.__get__(self) @@ -515,8 +556,8 @@ def _prepare_past_key_values(self, input_ids): d_k = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads batch_size = input_ids.shape[0] - if model_type in {"mistral", "llama"}: - num_attention_heads = self.normalized_config.num_key_value_heads + if model_type in {"mistral", "llama", "falcon"}: + num_attention_heads = getattr(self.normalized_config, "num_key_value_heads", 1) else: num_attention_heads = self.normalized_config.num_attention_heads diff --git a/optimum/intel/ipex/utils.py b/optimum/intel/ipex/utils.py index 0cc9e6a97..2cc8cc2c6 100644 --- a/optimum/intel/ipex/utils.py +++ b/optimum/intel/ipex/utils.py @@ -14,9 +14,13 @@ _HEAD_TO_AUTOMODELS = { + "feature-extraction": "IPEXModel", "text-generation": "IPEXModelForCausalLM", "text-classification": "IPEXModelForSequenceClassification", "token-classification": "IPEXModelForTokenClassification", "question-answering": "IPEXModelForQuestionAnswering", "text2text-generation": "IPEXModelForSeq2SeqLM", + "fill-mask": "IPEXModelForMaskedLM", + "image-classification": "IPEXModelForImageClassification", + "audio-classification": "IPEXModelForAudioClassification", } diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index be59e0b4d..aaaca031b 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -41,7 +41,14 @@ class OVQuantizationMethod(str, Enum): _DEFAULT_4BIT_CONFIGS = { - "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "databricks/dolly-v2-3b": { + "bits": 4, + "sym": False, + "group_size": 128, + "ratio": 1.0, + "dataset": "wikitext2", + "scale_estimation": True, + }, "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, 
"group_size": 64}, "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128}, @@ -77,7 +84,14 @@ class OVQuantizationMethod(str, Enum): "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72}, "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6}, "openlm-research/open_llama_3b": {"bits": 4, "sym": False, "group_size": 64, "all_layers": True}, - "openlm-research/open_llama_3b_v2": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, + "openlm-research/open_llama_3b_v2": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + }, "tiiuae/falcon-7b-instruct": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "psmathur/orca_mini_3b": { "bits": 4, @@ -95,7 +109,14 @@ class OVQuantizationMethod(str, Enum): }, "mistralai/Mixtral-8x7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "facebook/opt-2.7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, - "togethercomputer/RedPajama-INCITE-Chat-3B-v1": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "togethercomputer/RedPajama-INCITE-Chat-3B-v1": { + "bits": 4, + "sym": False, + "group_size": 128, + "ratio": 1.0, + "dataset": "wikitext2", + "scale_estimation": True, + }, "lmsys/vicuna-7b-v1.5": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, "stabilityai/stablelm-tuned-alpha-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, "mistralai/Mistral-7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.9}, @@ -107,10 +128,42 @@ class OVQuantizationMethod(str, Enum): "dataset": "wikitext2", "quant_method": OVQuantizationMethod.AWQ, }, - "lmsys/longchat-7b-16k": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.9}, + "lmsys/longchat-7b-16k": { + "bits": 4, + "sym": False, + "group_size": 128, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + "scale_estimation": True, + }, "bigcode/starcoder2-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.9}, - "TinyLlama/TinyLlama-1.1B-Chat-v1.0": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, - "microsoft/phi-2": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.9}, + "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + "scale_estimation": True, + }, + "microsoft/phi-2": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + "scale_estimation": True, + }, + "stabilityai/stablelm-tuned-alpha-7b": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + "scale_estimation": True, + }, } _DEFAULT_4BIT_CONFIG = { @@ -126,7 +179,8 @@ def _check_default_4bit_configs(model_id_or_path: str): if model_id_or_path in _DEFAULT_4BIT_CONFIGS: return _DEFAULT_4BIT_CONFIGS[model_id_or_path] - config_path = Path(model_id_or_path) / "config.json" + model_path = Path(model_id_or_path) + config_path = model_path / "config.json" if config_path.exists(): with config_path.open("r") as config_f: config = json.load(config_f) @@ -134,6 +188,11 @@ def _check_default_4bit_configs(model_id_or_path: str): if original_model_name in _DEFAULT_4BIT_CONFIGS: return 
_DEFAULT_4BIT_CONFIGS[original_model_name] + for model_id, config in _DEFAULT_4BIT_CONFIGS.items(): + short_id = model_id.split("/")[-1] + if model_path.name == short_id: + return config + return None @@ -221,11 +280,11 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. dataset (`str or List[str]`, *optional*): - The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset - in a list of strings or just use the one from the list ['wikitext2','c4','c4-new'] for language models - or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. - Alternatively, you can provide data objects via `calibration_dataset` argument - of `OVQuantizer.quantize()` method. + The dataset used for data-aware compression with NNCF. For language models you can provide your own dataset + in a list of strings or just use the one from the list ['wikitext2','c4','c4-new']. For diffusion models it + must be one of ['conceptual_captions', 'laion/220k-GPT4Vision-captions-from-LIVIS', 'laion/filtered-wit']. + Alternatively, you can provide data objects via `calibration_dataset` argument of `OVQuantizer.quantize()` + method. ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 753c8f90d..554fdee7c 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -281,10 +281,17 @@ def _from_transformers( if load_in_8bit is None and not quantization_config: ov_export_config = None else: - ov_export_config = OVConfig(dtype="fp32") + ov_export_config = OVConfig(dtype="auto") stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache) + torch_dtype = kwargs.pop("torch_dtype", None) + + model_loading_kwargs = {} + + if torch_dtype is not None: + model_loading_kwargs["torch_dtype"] = torch_dtype + main_export( model_name_or_path=model_id, output=save_dir_path, @@ -298,6 +305,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_export_config, stateful=stateful, + model_loading_kwargs=model_loading_kwargs, ) config.is_decoder = True diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f038938bb..df9d496de 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -352,12 +352,14 @@ def _quantize_ovbasemodel( "quantization. Will rely on `calibration_dataset`." 
) - if calibration_dataset is None and isinstance(quantization_config.dataset, str): + if calibration_dataset is None and quantization_config.dataset is not None: from optimum.intel import OVModelForCausalLM if isinstance(self.model, OVModelForCausalLM): - calibration_dataset = self._prepare_builtin_dataset(quantization_config) + calibration_dataset = self._prepare_causal_lm_dataset(quantization_config) elif is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + if not isinstance(quantization_config.dataset, str): + raise ValueError("Please provide dataset as one of the accepted dataset labels.") calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset_name=quantization_config.dataset ) @@ -374,10 +376,10 @@ def _quantize_ovbasemodel( quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT - for sd_submodel_name in ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"]: - sd_submodel = getattr(self.model, sd_submodel_name) - if sd_submodel is not None: - _weight_only_quantization(sd_submodel.model, quantization_config_copy) + sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"] + sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) + for sub_model in sub_models: + _weight_only_quantization(sub_model.model, quantization_config_copy) # Apply hybrid quantization to UNet self.model.unet.model = _hybrid_quantization( @@ -387,7 +389,13 @@ def _quantize_ovbasemodel( # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: - _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2", "unet"] + sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) + for sub_model in sub_models: + _weight_only_quantization(sub_model.model, quantization_config) + else: + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) if save_directory is not None: self.model.save_pretrained(save_directory) ov_config.save_pretrained(save_directory) @@ -670,14 +678,20 @@ def _remove_unused_columns(self, dataset: "Dataset"): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) - def _prepare_builtin_dataset(self, quantization_config: OVWeightQuantizationConfig): + def _prepare_causal_lm_dataset(self, quantization_config: OVWeightQuantizationConfig): from optimum.gptq.data import get_dataset, prepare_dataset tokenizer = AutoTokenizer.from_pretrained( quantization_config.tokenizer, trust_remote_code=quantization_config.trust_remote_code ) nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 - calibration_dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) + config_dataset = quantization_config.dataset + if isinstance(config_dataset, str): + calibration_dataset = get_dataset(config_dataset, tokenizer, seqlen=32, nsamples=nsamples) + elif isinstance(config_dataset, list) and all(isinstance(it, str) for it in config_dataset): + 
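+            # A plain list of strings is treated as a user-provided calibration corpus:
+            # each entry is tokenized on its own and only the first `nsamples` texts are used.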
calibration_dataset = [tokenizer(text, return_tensors="pt") for text in config_dataset[:nsamples]] + else: + raise ValueError("Please provide dataset as one of the accepted dataset labels or as a list of strings.") calibration_dataset = prepare_dataset(calibration_dataset) calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 14e7275c5..5e321aacb 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -29,7 +29,6 @@ ImageToTextPipeline, Pipeline, PreTrainedTokenizer, - PreTrainedTokenizerFast, QuestionAnsweringPipeline, SummarizationPipeline, Text2TextGenerationPipeline, diff --git a/optimum/intel/utils/__init__.py b/optimum/intel/utils/__init__.py index d77588f89..50cdfa143 100644 --- a/optimum/intel/utils/__init__.py +++ b/optimum/intel/utils/__init__.py @@ -22,6 +22,7 @@ is_neural_compressor_available, is_neural_compressor_version, is_nncf_available, + is_numa_available, is_openvino_available, is_torch_version, is_transformers_available, diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 6be0aac47..032280e94 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -150,6 +150,14 @@ except importlib_metadata.PackageNotFoundError: _accelerate_available = False +_numa_available = importlib.util.find_spec("numa") is not None + +if _numa_available: + try: + importlib_metadata.version("numa") + except importlib_metadata.PackageNotFoundError: + _numa_available = False + def is_transformers_available(): return _transformers_available @@ -272,6 +280,10 @@ def is_accelerate_available(): return _accelerate_available +def is_numa_available(): + return _numa_available + + # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): """ diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 9b68266d1..1d2f7b03c 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -12,15 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging +import math +import os +import platform import re from pathlib import Path from typing import List, Optional, Union +import psutil import torch from huggingface_hub import HfApi, HfFolder +from .import_utils import is_numa_available -MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"} + +MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"} + +logger = logging.getLogger(__name__) def get_model_device(model: torch.nn.Module) -> torch.device: @@ -110,3 +119,101 @@ def _find_files_matching_pattern( files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) == subfolder] return files + + +def replace_customized_linear_with_linear(model): + """ + Replace custom linear to torch linear so ipex could recognize and replace them to ipex linear. 
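+    Only acts on models in eval mode; ``torch.jit.ScriptModule`` inputs are returned untouched,
+    and the original weight and bias tensors are reused so numerics are unchanged.
+
+    Example (an illustrative sketch; the checkpoint name is only a placeholder):
+
+    .. code-block:: python
+
+        from transformers import AutoModelForCausalLM
+
+        model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b")
+        model.eval()
+        replace_customized_linear_with_linear(model)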
+ """ + if isinstance(model, torch.jit.ScriptModule): + return + if not model.training: + for child_name, child in model.named_children(): + if isinstance(child, torch.nn.Linear) and child.__class__.__name__ in [ + "FalconLinear", + "Linear", + ]: + new_m = torch.nn.Linear( + child.in_features, + child.out_features, + bias=False if child.bias is None else True, + ) + new_m.weight = child.weight + if child.bias is not None: + new_m.bias = child.bias + setattr(model, child_name, new_m) + else: + replace_customized_linear_with_linear(child) + + +def get_int_from_env(env_keys, default): + """Returns the first positive env value found in the `env_keys` list or the default.""" + for e in env_keys: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +def bind_cores_for_best_perf(): + """ + Set number of threads per rank, numa cpu affinity and numa memory binding if not already set for better OOB performance. + Works for wold_size >= 1 and rank >= 1 + + Example: + .. code-block:: python + + from optimum.intel.ipex import IPEXModelForCausalLM + from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf + + bind_cores_for_best_perf() + model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True) + tokenizer = AutoTokenizer.from_pretrained("gpt2") + input_sentence = ["tell me a story about a trip to the moon"] + model_inputs = tokenizer(input_sentence, return_tensors="pt") + generation_kwargs = dict(max_new_tokens=500) + generated_ids = model.generate(**model_inputs, **generation_kwargs) + + Returns: + None + + """ + if platform.system() != "Linux": + logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + if not is_numa_available(): + logger.error("'numa' module not found") + raise ImportError("'numa' module not found, install with 'pip install numa'") + import numa + + local_size = get_int_from_env( + ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1 + ) + rank_id = get_int_from_env( + ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0 + ) + nodes = numa.get_max_node() + 1 + rank_per_node = math.ceil(local_size / nodes) + num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes) + node_id = int(rank_id / rank_per_node) + rank_offset_per_node = rank_id % rank_per_node + if os.getenv("OMP_NUM_THREADS") is None: + num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1) + logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance") + else: + num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS")) + logger.info(f"OMP_NUM_THREADS already set to {num_cpus_per_rank}") + if len(numa.get_membind()) == nodes: + # if numa memory binding is not set, set it to the node where the rank is running + numa.set_membind([node_id]) + + torch.set_num_threads(num_cpus_per_rank) + + if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True): + # if numa affinity is unset (default value is set to all logical cores) set it to the physical cores assigned to the rank + cpu_start = num_cpus_per_rank * rank_offset_per_node + numa.set_affinity( + 0, + list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank], + ) + logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}") diff --git a/setup.py b/setup.py index 
59eadde37..e637f49e1 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>=2.2.0", "accelerate", "transformers<4.43.0"], + "neural-compressor": ["neural-compressor>=2.2.0,<3.0", "accelerate", "transformers<4.43.0"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"], diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 7b042a4e0..01f935292 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -213,6 +213,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "blenderbot-small", "bloom", "codegen", + "falcon", "gpt2", "gpt_neo", "gpt_neox", @@ -220,10 +221,11 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "llama", "llama2", # "phi", + "distilgpt2", "mpt", "opt", ) - IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2",) + IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "distilgpt2", "falcon") GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.0 @@ -263,8 +265,9 @@ def test_compare_to_transformers(self, model_arch): # Compare tensor outputs self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) - self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) - self.assertTrue(torch.equal(outputs.logits, init_model_outputs.logits)) + # To avoid float pointing error + self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) + self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -281,7 +284,8 @@ def test_pipeline(self, model_arch): # High optimized model llama is not supported assisted decoding for now. @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_assisted_decoding(self, model_arch): - if model_arch == "llama2": + # Patched models are not support assisted decoding for now. 
+ if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES: return model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -335,6 +339,24 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): self.assertIsInstance(outputs, torch.Tensor) self.assertTrue(torch.equal(outputs, transformers_outputs)) + @parameterized.expand(IPEX_PATCHED_SUPPORTED_ARCHITECTURES) + @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching") + def test_patched_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + patched_model_id = MODEL_NAMES["patched_" + model_arch] + ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) + exported_model = IPEXModelForCausalLM.from_pretrained(patched_model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer( + "This is a sample", + return_tensors="pt", + return_token_type_ids=False if model_arch in ("llama", "llama2") else None, + ) + inputs = ipex_model.prepare_inputs_for_generation(**tokens) + ipex_outputs = ipex_model(**inputs) + exported_outputs = exported_model(**inputs) + self.assertTrue(torch.allclose(ipex_outputs.logits, exported_outputs.logits, atol=1e-7)) + def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/ipex/utils_tests.py b/tests/ipex/utils_tests.py index a14f0bf7c..595bc0246 100644 --- a/tests/ipex/utils_tests.py +++ b/tests/ipex/utils_tests.py @@ -25,8 +25,10 @@ "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", + "distilgpt2": "Jiqing/tiny_random_distilgpt2", "electra": "hf-internal-testing/tiny-random-electra", "flaubert": "hf-internal-testing/tiny-random-flaubert", + "falcon": "Jiqing/tiny_random_falcon", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", @@ -54,4 +56,7 @@ "vit": "hf-internal-testing/tiny-random-vit", "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", "xlm": "hf-internal-testing/tiny-random-xlm", + "patched_falcon": "Jiqing/patched_tiny_random_falcon_for_causal_lm", + "patched_distilgpt2": "Jiqing/patched_tiny_random_distilgpt2_for_causal_lm", + "patched_llama2": "Jiqing/patched_tiny_random_llama2_for_causal_lm", } diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 6d05158dd..ef20ed5a2 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -78,6 +78,7 @@ def _openvino_export( model_type: str, compression_option: Optional[str] = None, stateful: bool = True, + patch_16bit_model: bool = False, ): auto_model = self.SUPPORTED_ARCHITECTURES[model_type] task = auto_model.export_feature @@ -171,6 +172,32 @@ def test_export_with_custom_gen_config(self, model_type): self.assertIsInstance(ov_model.generation_config, GenerationConfig) self.assertTrue(ov_model.generation_config.top_k == 42) + def test_export_fp16_model(self): + auto_model = self.SUPPORTED_ARCHITECTURES["gpt2"] + task = auto_model.export_feature + model_name = MODEL_NAMES["gpt2"] + model = auto_model.auto_model_class.from_pretrained(model_name, torch_dtype=torch.float16) + stateful = True + + for supported_task in [task, task + "with-past"]: + with TemporaryDirectory() as tmpdirname: + 
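+                # Export the checkpoint that was loaded in float16 through the 16-bit
+                # patching path, then reload it below via the OV model class and check
+                # its use_cache / stateful metadata.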
export_from_model( + model=model, + output=Path(tmpdirname), + task=task, + preprocessors=None, + patch_16bit_model=True, + stateful=stateful, + ) + use_cache = supported_task.endswith("-with-past") + ov_model = auto_model.from_pretrained(tmpdirname, use_cache=use_cache) + self.assertIsInstance(ov_model, OVBaseModel) + self.assertEqual(ov_model.use_cache, use_cache) + self.assertEqual(ov_model.stateful, stateful and use_cache) + self.assertEqual( + ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version + ) + class CustomExportModelTest(unittest.TestCase): def test_custom_export_config_model(self): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 3186535a0..b5aff8d17 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -41,7 +41,10 @@ ) from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS -from optimum.intel.utils.import_utils import is_openvino_tokenizers_available +from optimum.intel.utils.import_utils import ( + compare_versions, + is_openvino_tokenizers_available, +) class OVCLIExportTestCase(unittest.TestCase): @@ -171,6 +174,9 @@ def test_exporters_cli_tokenizers(self, task: str, model_type: str): if number_of_tokenizers == 1: self.assertTrue("Detokenizer is not supported, convert tokenizer only." in output, output) + if task.startswith("text-generation") and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0"): + self.assertIn("Set tokenizer padding side to left", output) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_exporters_cli_fp16(self, task: str, model_type: str): with TemporaryDirectory() as tmpdir: diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index bb5024445..d71bbea45 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -657,6 +657,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "qwen2-moe", "arctic", "exaone", + "mistral-nemo", ) GENERATION_LENGTH = 100 @@ -1015,6 +1016,30 @@ def test_beam_search(self, model_arch): f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model_stateless output {ov_stateless_outputs}", ) + def test_load_with_different_dtype(self): + set_seed(SEED) + model_id = MODEL_NAMES["llama"] + pt_model = AutoModelForCausalLM.from_pretrained( + model_id, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + texts = ["this is a simple input"] + test_input = tokenizer(texts, return_tensors="pt") + + ref_logits = pt_model(**test_input).logits + torch_dtypes = [None, "auto", "float32", torch.float16] + if is_openvino_version(">", "2024.2.0"): + torch_dtypes.append("bfloat16") + + for dtype in torch_dtypes: + ov_model = OVModelForCausalLM.from_pretrained(model_id=model_id, export=True, torch_dtype=dtype) + ov_logits = ov_model(**test_input).logits + self.assertTrue( + torch.allclose(torch.Tensor(ov_logits), ref_logits, atol=5e-3), + f"values are not close for {dtype if dtype is not None else 'None'}, max diff = {torch.abs(ov_logits - ref_logits).max()}", + ) + class OVModelForMaskedLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b488eaf71..23ff3a03c 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -223,6 +223,19 @@ class OVWeightCompressionTest(unittest.TestCase): ), 14, ), 
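+        # 4-bit data-aware compression driven by an in-memory list of strings
+        # rather than a named calibration dataset.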
+ ( + OVModelForCausalLM, + "opt", + dict( + bits=4, + sym=True, + group_size=-1, + ratio=0.8, + sensitivity_metric="mean_activation_magnitude", + dataset=["one two, " * i for i in range(10)], + ), + 14, + ), ( OVModelForCausalLM, "llama_awq", @@ -236,7 +249,7 @@ class OVWeightCompressionTest(unittest.TestCase): quant_method=QuantizationMethod.AWQ, scale_estimation=True, ), - 16, + 8, ), ( OVModelForCausalLM, @@ -250,7 +263,7 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="c4", quant_method="awq", ), - 16, + 8, ), ) @@ -420,6 +433,18 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f model.save_pretrained(tmp_dir) + def test_stable_diffusion_with_weight_compression(self): + int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True) + quantization_config = OVWeightQuantizationConfig(bits=8, quant_method=OVQuantizationMethod.DEFAULT) + quantizer = OVQuantizer(int8_pipe) + + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + + num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(int8_pipe.unet) + self.assertEqual(0, num_fake_quantize) + self.assertEqual(242, num_int8) + self.assertEqual(0, num_int4) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:]) def test_ovmodel_hybrid_quantization_with_custom_dataset( self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8 @@ -544,7 +569,7 @@ def test_ovmodel_load_large_model_with_uncompressed_weights(self): save_model_patch.assert_called_with( unittest.mock.ANY, unittest.mock.ANY, - ov_config=OVConfig(dtype="fp32"), + ov_config=OVConfig(dtype="auto"), library_name="transformers", ) @@ -567,7 +592,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): save_model_patch.assert_called_with( unittest.mock.ANY, unittest.mock.ANY, - ov_config=OVConfig(dtype="fp32"), + ov_config=OVConfig(dtype="auto"), library_name="transformers", ) compression_params = { @@ -878,6 +903,13 @@ def test_named_default_configurations(self, config_id: str): value = prepared_config.__getattribute__(field_name) self.assertEqual(value, reference_value) + def test_for_no_short_id_duplicates(self): + short_ids = set() + for model_id in _DEFAULT_4BIT_CONFIGS.keys(): + short_id = model_id.split("/")[1] + assert short_id not in short_ids + short_ids.add(short_id) + class InferRequestWrapperTest(unittest.TestCase): MODEL_ID = ("openai/whisper-tiny.en",) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index e75f3ee62..f13723eef 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -80,6 +80,7 @@ "mbart": "hf-internal-testing/tiny-random-mbart", "minicpm": "katuni4ka/tiny-random-minicpm", "mistral": "echarlaix/tiny-random-mistral", + "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo", "mixtral": "TitanML/tiny-mixtral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192",