Ipex llama model #6

Closed
wants to merge 10 commits
25 changes: 24 additions & 1 deletion README.md
@@ -6,6 +6,8 @@

🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.

[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library that provides optimizations for both eager mode and graph mode. Compared to eager mode, graph mode in PyTorch normally yields better performance through optimization techniques such as operation fusion.
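
As an illustration of the two modes (the checkpoint and bfloat16 dtype below are illustrative assumptions, not requirements), a model can be optimized in eager mode with `ipex.optimize` and then captured as a TorchScript graph with `torch.jit.trace`:

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Eager mode: operator-level optimizations applied in place
model = ipex.optimize(model, dtype=torch.bfloat16)

# Graph mode: trace to TorchScript so graph-level fusions can be applied
inputs = tokenizer("An example input", return_tensors="pt")
with torch.no_grad(), torch.cpu.amp.autocast(dtype=torch.bfloat16):
    traced = torch.jit.trace(model, example_kwarg_inputs=dict(inputs), strict=False)
    traced = torch.jit.freeze(traced)
    print(traced(**inputs))
```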

Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies so that users can easily generate quantized models. Users can apply static, dynamic and quantization-aware training approaches while specifying an expected accuracy criterion. It also supports different weight-pruning techniques, enabling the creation of pruned models that meet a predefined sparsity target.
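
As a minimal sketch of post-training dynamic quantization through Optimum Intel (the checkpoint and output directory are placeholders):

```python
from neural_compressor.config import PostTrainingQuantConfig
from transformers import AutoModelForSequenceClassification

from optimum.intel import INCQuantizer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
# Dynamic quantization: weights are quantized ahead of time, activations on the fly
quantization_config = PostTrainingQuantConfig(approach="dynamic")
quantizer = INCQuantizer.from_pretrained(model)
quantizer.quantize(quantization_config=quantization_config, save_directory="./quantized_model")
```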

[OpenVINO](https://docs.openvino.ai/latest/index.html) is an open-source toolkit that enables high-performance inference on Intel CPUs, GPUs and specialized DL inference accelerators ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime.
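
As a minimal sketch (the checkpoint is a placeholder), a Transformers model can be exported to the OpenVINO IR on the fly and used as a drop-in replacement in a pipeline:

```python
from transformers import AutoTokenizer, pipeline

from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
# export=True converts the PyTorch checkpoint to the OpenVINO IR format
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("This movie is disgustingly good !"))
```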
@@ -17,6 +19,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi

| Accelerator | Installation |
|:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------|
| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` |
| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` |
| [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino,nncf]"` |

@@ -37,10 +40,30 @@ or to install from source including dependencies:
python -m pip install "optimum-intel[extras]"@git+https://github.com/huggingface/optimum-intel.git
```

where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`.
where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `nncf`.

# Quick tour

## Intel Extension for PyTorch
To load a model and run generation with IPEX graph mode, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class.
```diff
import torch
from transformers import AutoTokenizer, pipeline
- from transformers import AutoModelForCausalLM
+ from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM


model_id = "gpt2"
- model = AutoModelForCausalLM.from_pretrained(model_id)
+ model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

print(text_generator("This is an example input"))
```

For now, we only support text-generation tasks.

## Neural Compressor

Dynamic quantization can be used through the Optimum command-line interface:
11 changes: 11 additions & 0 deletions examples/ipex/text-classification/run_classification.py
@@ -0,0 +1,11 @@
import torch
from transformers import AutoTokenizer, pipeline

from optimum.intel.ipex.modeling_base import IPEXModelForSequenceClassification


model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
text_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(text_classifier("This movie is disgustingly good !"))
12 changes: 12 additions & 0 deletions examples/ipex/text-generation/run_generation.py
@@ -0,0 +1,12 @@
import torch
from transformers import AutoTokenizer, pipeline

from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM


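# Note: meta-llama/Llama-2-7b-chat-hf is a gated checkpoint; accept its license on the Hub and authenticate (e.g. `huggingface-cli login`) before running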
model_id = "meta-llama/Llama-2-7b-chat-hf"
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

print(text_generator("This is an example input"))
61 changes: 33 additions & 28 deletions optimum/intel/generation/modeling.py
@@ -102,6 +102,7 @@ def __init__(
self.model_save_dir = model_save_dir
self.preprocessors = kwargs.get("preprocessors", [])
self.use_cache = use_cache
# TODO: add XPU support
self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config)
self.model_dtype = kwargs.get("model_dtype", None)
@@ -255,6 +256,37 @@ def to(self, device: Union[torch.device, str]):
self.model.to(self._device)
return self

def prepare_past_key_values(self, input_ids):
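"""Build empty (zero-length) key/value caches in the layout expected by the architecture, so generation can start from a well-formed past_key_values on the first forward pass."""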
model_type = self.config.model_type.replace("_", "-")
nb_pkv = 2
num_layers = self.normalized_config.num_layers
d_k = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads
batch_size = input_ids.shape[0]

if model_type in {"mistral", "llama"}:
num_attention_heads = self.normalized_config.num_key_value_heads
else:
num_attention_heads = self.normalized_config.num_attention_heads

if model_type == "bloom":
shape_key = (batch_size * num_attention_heads, d_k, 0)
shape_value = (batch_size * num_attention_heads, 0, d_k)
key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device)
value = torch.empty(size=shape_value, dtype=self.model_dtype, device=self._device)
past_key_values = tuple(
tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) for _ in range(num_layers)
)
elif model_type.replace("-", "_") in MULTI_QUERY_ATTN_MODELS:
shape = (batch_size, 0, d_k * 2)
pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device)
past_key_values = tuple(pkv for _ in range(num_layers))
else:
shape = (batch_size, num_attention_heads, 0, d_k)
pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device)
past_key_values = tuple(tuple(pkv for _ in range(nb_pkv)) for _ in range(num_layers))

return past_key_values

def forward(
self,
input_ids: torch.LongTensor = None,
@@ -281,36 +313,9 @@ def forward(
if "position_ids" in self.input_names or not self.input_names:
inputs["position_ids"] = position_ids

model_type = self.config.model_type.replace("_", "-")

if self.use_cache:
if past_key_values is None:
nb_pkv = 2
num_layers = self.normalized_config.num_layers
d_k = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads
batch_size = input_ids.shape[0]

if model_type in {"mistral", "llama"}:
num_attention_heads = self.normalized_config.num_key_value_heads
else:
num_attention_heads = self.normalized_config.num_attention_heads

if model_type == "bloom":
shape_key = (batch_size * num_attention_heads, d_k, 0)
shape_value = (batch_size * num_attention_heads, 0, d_k)
key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device)
value = torch.empty(size=shape_value, dtype=self.model_dtype, device=self._device)
past_key_values = tuple(
tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) for _ in range(num_layers)
)
elif model_type.replace("-", "_") in MULTI_QUERY_ATTN_MODELS:
shape = (batch_size, 0, d_k * 2)
pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device)
past_key_values = tuple(pkv for _ in range(num_layers))
else:
shape = (batch_size, num_attention_heads, 0, d_k)
pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device)
past_key_values = tuple(tuple(pkv for _ in range(nb_pkv)) for _ in range(num_layers))
past_key_values = self.prepare_past_key_values(input_ids)

inputs["past_key_values"] = past_key_values

3 changes: 3 additions & 0 deletions optimum/intel/ipex/__init__.py
@@ -1 +1,4 @@
from .inference import inference_mode


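# Tasks for which the `use_cache` argument is relevant and kept at export time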
generation_tasks = ("text-generation",)
228 changes: 228 additions & 0 deletions optimum/intel/ipex/modeling_base.py
@@ -0,0 +1,228 @@
import logging
import os
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional, Union

import intel_extension_for_pytorch as ipex
import torch
from huggingface_hub import hf_hub_download
from transformers import (
AutoConfig,
AutoModel,
AutoModelForSequenceClassification,
GenerationMixin,
PretrainedConfig,
)
from transformers.models.auto.auto_factory import _get_model_class
from transformers.utils import WEIGHTS_NAME

from optimum.exporters import TasksManager
from optimum.modeling_base import OptimizedModel

from ..generation.modeling import jit_trace
from ..utils.import_utils import is_torch_version
from ..utils.modeling_utils import patch_decoder_attention_mask
from . import generation_tasks

from .models.llama import LlamaForCausalLM

SUPPORT_MODEL_LIST_FOR_CAUSAL_LM = {
"llama": LlamaForCausalLM
}

SUPPORT_TASK_LIST = {"text-generation": SUPPORT_MODEL_LIST_FOR_CAUSAL_LM}


logger = logging.getLogger(__name__)


class IPEXModel(OptimizedModel):
auto_model_class = AutoModel
export_feature = "feature-extraction"
base_model_prefix = "ipex_model"

def __init__(
self,
model,
config: PretrainedConfig = None,
model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
use_cache: bool = True,
**kwargs,
):
OptimizedModel.__init__(self, model=model, config=config)
# TODO: add XPU support
self._device = torch.device("cpu")
self.model.to(self._device)

# Registers the IPEXModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating
# a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863
AutoConfig.register(self.base_model_prefix, AutoConfig)
if hasattr(self.auto_model_class, "register"):
self.auto_model_class.register(AutoConfig, self.__class__)

@classmethod
def _from_transformers(
cls,
model_id: str,
config: PretrainedConfig,
use_auth_token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
force_download: bool = False,
cache_dir: Optional[str] = None,
subfolder: str = "",
local_files_only: bool = False,
use_cache: bool = True,
torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
**kwargs,
):
if is_torch_version("<", "2.1.0"):
raise ImportError("`torch>=2.1.0` is needed to trace your model")
task = cls.export_feature
model_kwargs = {
"revision": revision,
"use_auth_token": use_auth_token,
"cache_dir": cache_dir,
"subfolder": subfolder,
"local_files_only": local_files_only,
"force_download": force_download,
"use_cache": use_cache,
"torch_dtype": torch_dtype,
}
if task not in generation_tasks:
model_kwargs.pop("use_cache")
model_type = None
support_ipex_transformers = False
if task in SUPPORT_TASK_LIST.keys():
for name in SUPPORT_TASK_LIST[task].keys():
if name in model_id:
support_ipex_transformers = True
model_type = name
break

if support_ipex_transformers and task in SUPPORT_TASK_LIST and model_type in SUPPORT_TASK_LIST[task]:
model = SUPPORT_TASK_LIST[task][model_type].from_pretrained(model_id, **model_kwargs)
else:
model = TasksManager.get_model_from_task(task, model_id, **model_kwargs)
model = patch_decoder_attention_mask(model)

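# Without a dedicated IPEX implementation, fall back to ipex.optimize (eager-mode operator fusion and weight prepacking)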
if not support_ipex_transformers:
model = ipex.optimize(model, dtype=torch_dtype, level="O1", auto_kernel_selection=True)

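# Trace the model to TorchScript, save it to a temporary directory and reload it through _from_pretrained; fall back to the eager model if tracing fails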
if kwargs.pop("jit", True):
try:
traced_model = cls.apply_jit_optimize(model, task, use_cache, support_ipex_transformers)
save_dir = TemporaryDirectory()
save_dir_path = Path(save_dir.name)
torch.jit.save(traced_model, save_dir_path / WEIGHTS_NAME)
config.torchscript = True

return cls._from_pretrained(
model_id=save_dir_path,
config=config,
use_cache=use_cache,
use_auth_token=use_auth_token,
revision=revision,
force_download=force_download,
cache_dir=cache_dir,
local_files_only=local_files_only,
model_dtype=torch_dtype,
**kwargs,
)
except Exception as e:
logger.warning(f"failed to use PyTorch jit mode due to: {e}.")

return cls(
model,
config=config,
use_cache=use_cache,
model_dtype=torch_dtype,
**kwargs,
)

@classmethod
def _from_pretrained(
cls,
model_id: Union[str, Path],
config: PretrainedConfig,
use_auth_token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
force_download: bool = False,
cache_dir: Optional[str] = None,
file_name: Optional[str] = WEIGHTS_NAME,
local_files_only: bool = False,
use_cache: bool = True,
**kwargs,
):
# Load the model from local directory
if os.path.isdir(model_id):
model_cache_path = os.path.join(model_id, file_name)
model_save_dir = model_id
# Download the model from the hub
else:
model_cache_path = hf_hub_download(
repo_id=model_id,
filename=file_name,
use_auth_token=use_auth_token,
revision=revision,
cache_dir=cache_dir,
force_download=force_download,
local_files_only=local_files_only,
)
model_save_dir = Path(model_cache_path).parent

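# TorchScript checkpoints are loaded and frozen for inference; otherwise the original transformers class is reloaded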
if getattr(config, "torchscript", False):
model = torch.jit.load(model_cache_path)
torch.jit.freeze(model.eval())
else:
model_class = _get_model_class(config, cls.auto_model_class._model_mapping)
model = model_class.from_pretrained(model_save_dir)

return cls(
model,
config=config,
model_save_dir=model_save_dir,
use_cache=use_cache,
**kwargs,
)

def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs):
if getattr(self.config, "torchscript", False):
torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME))
else:
torch.save(self.model, os.path.join(save_directory, WEIGHTS_NAME))

def forward(self, *args, **kwargs):
return self.model(*args, **kwargs)

def eval(self):
self.model.eval()
return self

@property
def device(self) -> torch.device:
return self._device

def to(self, device: Union[torch.device, str]):
self._device = device if isinstance(device, torch.device) else torch.device(device)
self.model.to(self._device)
return self

def can_generate(self):
return isinstance(self.model, GenerationMixin)

def generate(self, *args, **kwargs):
if not self.can_generate():
raise TypeError(
f"The current model class {self.model.__class__} is not compatible with `.generate()`, as it doesn't have a language model head."
)
return self.model.generate(*args, **kwargs)

@classmethod
def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers=False):
return jit_trace(model, task, use_cache)


class IPEXModelForSequenceClassification(IPEXModel):
auto_model_class = AutoModelForSequenceClassification
export_feature = "text-classification"