From 13f28361c3982c46b83a9fb699ff6a0bce6b9d2d Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 14 Aug 2024 14:00:02 +0200 Subject: [PATCH 01/20] Allow to infer VAE in f16 precision (#859) * Added custom ov_configs for vae encoder and decoder * Simplify usage --- optimum/intel/openvino/modeling_diffusion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 22b98d6c4..9b945caab 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -718,7 +718,7 @@ def __call__(self, latent_sample: np.ndarray): return list(outputs.values()) def _compile(self): - if "GPU" in self._device: + if "GPU" in self._device and "INFERENCE_PRECISION_HINT" not in self.ov_config: self.ov_config.update({"INFERENCE_PRECISION_HINT": "f32"}) super()._compile() @@ -739,7 +739,7 @@ def __call__(self, sample: np.ndarray): return list(outputs.values()) def _compile(self): - if "GPU" in self._device: + if "GPU" in self._device and "INFERENCE_PRECISION_HINT" not in self.ov_config: self.ov_config.update({"INFERENCE_PRECISION_HINT": "f32"}) super()._compile() From 79ec301f010324bd2badc409c2b16a41a34dd63b Mon Sep 17 00:00:00 2001 From: Sofya Balandina Date: Wed, 14 Aug 2024 15:26:10 +0100 Subject: [PATCH 02/20] Fix data collision in caching in InferRequestWrapper (#864) --- optimum/intel/openvino/quantization.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index 8262a1cff..f038938bb 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -132,9 +132,10 @@ def collect_inputs(self, inputs): data_hash = hash(data.tobytes()) # Avoid data copying if tensor contains data encountered earlier - if data_hash not in self.tensor_cache: - self.tensor_cache[data_hash] = copy.deepcopy(v) - copied_inputs[k] = self.tensor_cache[data_hash] + self.tensor_cache.setdefault(k, {}) + if data_hash not in self.tensor_cache[k]: + self.tensor_cache[k][data_hash] = copy.deepcopy(v) + copied_inputs[k] = self.tensor_cache[k][data_hash] self.collected_inputs.append(copied_inputs) def __call__(self, *args, **kwargs): From 7b8eaa600b904c0ac3fab2853eb776000749665c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 14 Aug 2024 18:45:42 +0400 Subject: [PATCH 03/20] support exaone model (#866) * support exaone model * docs --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_configs.py | 15 +++++++++++++++ optimum/exporters/openvino/model_patcher.py | 9 ++++++--- tests/openvino/test_modeling.py | 2 ++ tests/openvino/utils_tests.py | 1 + 5 files changed, 25 insertions(+), 3 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 83acd37ef..b82e68fe4 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -44,6 +44,7 @@ Here is the list of the supported architectures : - DistilBert - Electra - Encoder Decoder +- Exaone - Falcon - Flaubert - GLM-4 diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index b8aed025b..7e82db16b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -357,6 +357,21 @@ def patch_model_for_export( return LlamaModelPatcher(self, model, model_kwargs=model_kwargs) +@register_in_tasks_manager( + "exaone", + *[ + 
"feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): + pass + + class QwenDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1e26efb8f..acc85d17b 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -103,7 +103,9 @@ def patch_model_with_bettertransformer(model): def patch_update_causal_mask(model, transformers_version): if is_transformers_version(">=", transformers_version): - model.model._update_causal_mask = types.MethodType(_llama_gemma_update_causal_mask, model.model) + inner_model = getattr(model, "model", getattr(model, "transformer", None)) + if inner_model is not None: + inner_model._update_causal_mask = types.MethodType(_llama_gemma_update_causal_mask, inner_model) def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -563,8 +565,9 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if hasattr(self._model.model, "_orig_update_causal_mask"): - self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + inner_model = getattr(self._model, "model", getattr(self._model, "transformer", None)) + if hasattr(inner_model, "_orig_update_causal_mask"): + inner_model._update_causal_mask = inner_model._orig_update_causal_mask # copied from https://github.com/huggingface/transformers/commit/57d7594a79a9f5d835abf2d4d384db0e4818e548 to unblock export with transformers 4.42 diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 612cacfcd..bb5024445 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -656,6 +656,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "qwen2", "qwen2-moe", "arctic", + "exaone", ) GENERATION_LENGTH = 100 @@ -675,6 +676,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "codegen2", "arctic", "glm4", + "exaone", ) @parameterized.expand(SUPPORTED_ARCHITECTURES) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 1f9b051d7..e75f3ee62 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -52,6 +52,7 @@ "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel", "detr": "hf-internal-testing/tiny-random-DetrModel", "electra": "hf-internal-testing/tiny-random-electra", + "exaone": "katuni4ka/tiny-random-exaone", "gemma": "fxmarty/tiny-random-GemmaForCausalLM", "falcon": "fxmarty/really-tiny-falcon-testing", "falcon-40b": "katuni4ka/tiny-random-falcon-40b", From 8cf3f266e70b7e98c3a8e671b725654110e36980 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Fri, 16 Aug 2024 11:24:28 +0400 Subject: [PATCH 04/20] [OpenVINO] Set Left Padding For Text Generation Task (#839) * Set Left Padding For Text Gen Task * Make Style * Add OV Tokenizers Version Check * Update tests/openvino/test_exporters_cli.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Make Style * Make Style --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/commands/export/openvino.py | 4 ++-- optimum/exporters/openvino/__main__.py | 10 ++++++---- optimum/exporters/openvino/convert.py | 
11 +++++++++++ optimum/intel/pipelines/pipeline_base.py | 1 - tests/openvino/test_exporters_cli.py | 8 +++++++- 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 3d8b44a56..139c7d0f8 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -305,7 +305,7 @@ def run(self): model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config) model.save_pretrained(self.args.output) if not self.args.disable_convert_tokenizer: - maybe_convert_tokenizers(library_name, self.args.output, model) + maybe_convert_tokenizers(library_name, self.args.output, model, task=task) elif task.startswith("text-generation") and quantize_with_dataset: from optimum.intel import OVModelForCausalLM @@ -324,7 +324,7 @@ def run(self): preprocessors = maybe_load_preprocessors( self.args.model, trust_remote_code=self.args.trust_remote_code ) - maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors) + maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task) else: # TODO : add input shapes main_export( diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index f2afd6535..9fe0de427 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -387,7 +387,7 @@ class StoreAttr(object): ) if convert_tokenizer: - maybe_convert_tokenizers(library_name, output, model, preprocessors) + maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task) clear_class_registry() del model @@ -399,7 +399,7 @@ class StoreAttr(object): GPTQQuantizer.post_init_model = orig_post_init_model -def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None): +def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None): """ Tries to convert tokenizers to OV format and export them to disk. @@ -412,6 +412,8 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro Model instance. preprocessors (`Iterable`, *optional*, defaults to None): Iterable possibly containing tokenizers to be converted. + task (`str`, *optional*, defaults to None): + The task to export the model for. Affects tokenizer conversion parameters. """ from optimum.exporters.openvino.convert import export_tokenizer @@ -420,7 +422,7 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None) if tokenizer: try: - export_tokenizer(tokenizer, output) + export_tokenizer(tokenizer, output, task=task) except Exception as exception: logger.warning( "Could not load tokenizer using specified model ID or path. 
OpenVINO tokenizer/detokenizer " @@ -430,6 +432,6 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro for tokenizer_name in ("tokenizer", "tokenizer_2"): tokenizer = getattr(model, tokenizer_name, None) if tokenizer: - export_tokenizer(tokenizer, output / tokenizer_name) + export_tokenizer(tokenizer, output / tokenizer_name, task=task) else: logger.warning("Tokenizer won't be converted.") diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 83c031435..ab76bab9d 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -40,6 +40,7 @@ _timm_version, _torch_version, _transformers_version, + compare_versions, ) from optimum.utils import DEFAULT_DUMMY_SHAPES, is_diffusers_available from optimum.utils.save_utils import maybe_save_preprocessors @@ -706,6 +707,7 @@ def export_tokenizer( tokenizer, output: Union[str, Path], suffix: Optional[str] = "", + task: Optional[str] = None, ): # avoid circular imports from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME @@ -722,6 +724,15 @@ def export_tokenizer( if output.exists(): tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output) + if ( + task is not None + and task.startswith("text-generation") + and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0") + ): + logger.info(f"Set tokenizer padding side to left for `{task}` task.") + tokenizer.padding_side = "left" + tokenizer.truncation_side = "left" + try: converted = convert_tokenizer(tokenizer, with_detokenizer=True) except NotImplementedError: diff --git a/optimum/intel/pipelines/pipeline_base.py b/optimum/intel/pipelines/pipeline_base.py index 39f48df27..ae3b5d3eb 100644 --- a/optimum/intel/pipelines/pipeline_base.py +++ b/optimum/intel/pipelines/pipeline_base.py @@ -29,7 +29,6 @@ ImageToTextPipeline, Pipeline, PreTrainedTokenizer, - PreTrainedTokenizerFast, QuestionAnsweringPipeline, SummarizationPipeline, Text2TextGenerationPipeline, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 3186535a0..b5aff8d17 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -41,7 +41,10 @@ ) from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS -from optimum.intel.utils.import_utils import is_openvino_tokenizers_available +from optimum.intel.utils.import_utils import ( + compare_versions, + is_openvino_tokenizers_available, +) class OVCLIExportTestCase(unittest.TestCase): @@ -171,6 +174,9 @@ def test_exporters_cli_tokenizers(self, task: str, model_type: str): if number_of_tokenizers == 1: self.assertTrue("Detokenizer is not supported, convert tokenizer only." 
in output, output) + if task.startswith("text-generation") and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0"): + self.assertIn("Set tokenizer padding side to left", output) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_exporters_cli_fp16(self, task: str, model_type: str): with TemporaryDirectory() as tmpdir: From 46f88586678e5d7936622017f73d8ac168df5fc0 Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Fri, 16 Aug 2024 14:14:12 +0200 Subject: [PATCH 05/20] Update default NNCF configurations (#869) * Add several data-aware configurations * Added tiny llama config * phi-2 config update --- optimum/intel/openvino/configuration.py | 56 ++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index be59e0b4d..f99670ad3 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -41,7 +41,14 @@ class OVQuantizationMethod(str, Enum): _DEFAULT_4BIT_CONFIGS = { - "databricks/dolly-v2-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "databricks/dolly-v2-3b": { + "bits": 4, + "sym": False, + "group_size": 128, + "ratio": 1.0, + "dataset": "wikitext2", + "scale_estimation": True, + }, "EleutherAI/gpt-j-6b": {"bits": 4, "sym": False, "group_size": 64}, "facebook/opt-6.7b": {"bits": 4, "sym": False, "group_size": 64, "ratio": 0.8}, "togethercomputer/RedPajama-INCITE-7B-Instruct": {"bits": 4, "sym": False, "group_size": 128}, @@ -95,7 +102,14 @@ class OVQuantizationMethod(str, Enum): }, "mistralai/Mixtral-8x7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8}, "facebook/opt-2.7b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.7}, - "togethercomputer/RedPajama-INCITE-Chat-3B-v1": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, + "togethercomputer/RedPajama-INCITE-Chat-3B-v1": { + "bits": 4, + "sym": False, + "group_size": 128, + "ratio": 1.0, + "dataset": "wikitext2", + "scale_estimation": True, + }, "lmsys/vicuna-7b-v1.5": {"bits": 4, "sym": False, "group_size": 128, "ratio": 1.0}, "stabilityai/stablelm-tuned-alpha-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, "mistralai/Mistral-7B-v0.1": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.9}, @@ -107,10 +121,42 @@ class OVQuantizationMethod(str, Enum): "dataset": "wikitext2", "quant_method": OVQuantizationMethod.AWQ, }, - "lmsys/longchat-7b-16k": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.9}, + "lmsys/longchat-7b-16k": { + "bits": 4, + "sym": False, + "group_size": 128, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + "scale_estimation": True, + }, "bigcode/starcoder2-3b": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.9}, - "TinyLlama/TinyLlama-1.1B-Chat-v1.0": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.8}, - "microsoft/phi-2": {"bits": 4, "sym": False, "group_size": 128, "ratio": 0.9}, + "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + "scale_estimation": True, + }, + "microsoft/phi-2": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + "scale_estimation": True, + }, + "stabilityai/stablelm-tuned-alpha-7b": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + 
"scale_estimation": True, + }, } _DEFAULT_4BIT_CONFIG = { From 1b7bd9f9ae84d045f8e66fdb236b10974e3decbe Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 16 Aug 2024 15:44:31 +0200 Subject: [PATCH 06/20] Update compression config for openlm-research/open_llama_3b_v2 (#860) * Remove compression with all_layers=True for openlm-research/open_llama_3b_v2 * Fix sym parameter * Add AWQ --- optimum/intel/openvino/configuration.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index f99670ad3..5f47d4c75 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -84,7 +84,14 @@ class OVQuantizationMethod(str, Enum): "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72}, "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6}, "openlm-research/open_llama_3b": {"bits": 4, "sym": False, "group_size": 64, "all_layers": True}, - "openlm-research/open_llama_3b_v2": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, + "openlm-research/open_llama_3b_v2": { + "bits": 4, + "sym": False, + "group_size": 64, + "ratio": 1.0, + "dataset": "wikitext2", + "quant_method": OVQuantizationMethod.AWQ, + }, "tiiuae/falcon-7b-instruct": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True}, "psmathur/orca_mini_3b": { "bits": 4, From 2737c5fb7f4aa42612e04c044df6aa3b723ea601 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 16 Aug 2024 15:55:19 +0200 Subject: [PATCH 07/20] Added a warning for missing --weight-format argument (#861) * Added a warning when some compression parameters were provided, but --weight-format was not * Remove map --- optimum/commands/export/openvino.py | 36 ++++++++++++++++++--------- optimum/exporters/openvino/convert.py | 4 +-- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 139c7d0f8..742612ca3 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -190,6 +190,24 @@ def parse_args_openvino(parser: "ArgumentParser"): ) +def no_compression_parameter_provided(args): + return all( + ( + it is None + for it in ( + args.ratio, + args.group_size, + args.sym, + args.all_layers, + args.dataset, + args.num_samples, + args.awq, + args.sensitivity_metric, + ) + ) + ) + + class OVExportCommand(BaseOptimumCLICommand): COMMAND = CommandInfo(name="openvino", help="Export PyTorch models to OpenVINO IR.") @@ -230,23 +248,17 @@ def run(self): if self.args.weight_format is None: ov_config = None + if not no_compression_parameter_provided(self.args): + logger.warning( + "The provided compression parameters will not affect conversion because of the missing --weight-format argument." 
+ ) elif self.args.weight_format in {"fp16", "fp32"}: ov_config = OVConfig(dtype=self.args.weight_format) else: is_int8 = self.args.weight_format == "int8" - # For int4 quantization if not parameter is provided, then use the default config if exist - if ( - not is_int8 - and self.args.ratio is None - and self.args.group_size is None - and self.args.sym is None - and self.args.all_layers is None - and self.args.dataset is None - and self.args.num_samples is None - and self.args.awq is None - and self.args.sensitivity_metric is None - ): + # For int4 quantization if no parameter is provided, then use the default config if exist + if no_compression_parameter_provided(self.args) and not is_int8: quantization_config = get_default_int4_config(self.args.model) else: quantization_config = { diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index ab76bab9d..a051e0c43 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -636,9 +636,9 @@ def export_from_model( if is_nncf_available(): from ...intel.openvino.configuration import OVConfig - ov_config = OVConfig(quantization_config={"bits": 8}) + ov_config = OVConfig(quantization_config={"bits": 8, "sym": False}) - logger.info("The model weights will be quantized to int8.") + logger.info("The model weights will be quantized to int8_asym.") else: logger.warning( "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf." From 4dab3b67258cab65b71d41670e277bb1696044a2 Mon Sep 17 00:00:00 2001 From: Lyalyushkin Nikolay Date: Fri, 16 Aug 2024 17:12:40 +0200 Subject: [PATCH 08/20] Fixed weight compression mode for Stable Diffusion pipeline (#870) * [OpenVINO] Fixed weight compression mode for Stable Diffusion pipeline * Added unit test * addressed comments --- optimum/intel/openvino/quantization.py | 16 +++++++++++----- tests/openvino/test_quantization.py | 12 ++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index f038938bb..cc1205e1a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -374,10 +374,10 @@ def _quantize_ovbasemodel( quantization_config_copy = copy.deepcopy(quantization_config) quantization_config_copy.dataset = None quantization_config_copy.quant_method = OVQuantizationMethod.DEFAULT - for sd_submodel_name in ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"]: - sd_submodel = getattr(self.model, sd_submodel_name) - if sd_submodel is not None: - _weight_only_quantization(sd_submodel.model, quantization_config_copy) + sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2"] + sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) + for sub_model in sub_models: + _weight_only_quantization(sub_model.model, quantization_config_copy) # Apply hybrid quantization to UNet self.model.unet.model = _hybrid_quantization( @@ -387,7 +387,13 @@ def _quantize_ovbasemodel( # The model may be for example OVModelForImageClassification, OVModelForAudioClassification, etc. 
self.model.model = _hybrid_quantization(self.model.model, quantization_config, calibration_dataset) else: - _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) + if is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + sub_model_names = ["vae_encoder", "vae_decoder", "text_encoder", "text_encoder_2", "unet"] + sub_models = filter(lambda x: x, (getattr(self.model, name) for name in sub_model_names)) + for sub_model in sub_models: + _weight_only_quantization(sub_model.model, quantization_config) + else: + _weight_only_quantization(self.model.model, quantization_config, calibration_dataset) if save_directory is not None: self.model.save_pretrained(save_directory) ov_config.save_pretrained(save_directory) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index b488eaf71..d0c414e62 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -420,6 +420,18 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f model.save_pretrained(tmp_dir) + def test_stable_diffusion_with_weight_compression(self): + int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_NAMES["stable-diffusion"], export=True) + quantization_config = OVWeightQuantizationConfig(bits=8, quant_method=OVQuantizationMethod.DEFAULT) + quantizer = OVQuantizer(int8_pipe) + + quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) + + num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(int8_pipe.unet) + self.assertEqual(0, num_fake_quantize) + self.assertEqual(242, num_int8) + self.assertEqual(0, num_int4) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:]) def test_ovmodel_hybrid_quantization_with_custom_dataset( self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8 From f6923266dad4d6857799755674e903ba76409b94 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 16 Aug 2024 17:40:53 +0200 Subject: [PATCH 09/20] Implement weight compression dataset preparation from list of strings (#867) * Implement dataset preparation from list of strings * Address comments * Also add a check for SD dataset * make style --- optimum/intel/openvino/configuration.py | 10 +++++----- optimum/intel/openvino/quantization.py | 16 ++++++++++++---- tests/openvino/test_quantization.py | 13 +++++++++++++ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 5f47d4c75..59e87b21e 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -274,11 +274,11 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. dataset (`str or List[str]`, *optional*): - The dataset used for data-aware compression or quantization with NNCF. You can provide your own dataset - in a list of strings or just use the one from the list ['wikitext2','c4','c4-new'] for language models - or ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit'] for diffusion models. - Alternatively, you can provide data objects via `calibration_dataset` argument - of `OVQuantizer.quantize()` method. + The dataset used for data-aware compression with NNCF. 
For language models you can provide your own dataset + in a list of strings or just use the one from the list ['wikitext2','c4','c4-new']. For diffusion models it + must be one of ['conceptual_captions', 'laion/220k-GPT4Vision-captions-from-LIVIS', 'laion/filtered-wit']. + Alternatively, you can provide data objects via `calibration_dataset` argument of `OVQuantizer.quantize()` + method. ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index cc1205e1a..df9d496de 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -352,12 +352,14 @@ def _quantize_ovbasemodel( "quantization. Will rely on `calibration_dataset`." ) - if calibration_dataset is None and isinstance(quantization_config.dataset, str): + if calibration_dataset is None and quantization_config.dataset is not None: from optimum.intel import OVModelForCausalLM if isinstance(self.model, OVModelForCausalLM): - calibration_dataset = self._prepare_builtin_dataset(quantization_config) + calibration_dataset = self._prepare_causal_lm_dataset(quantization_config) elif is_diffusers_available() and isinstance(self.model, OVStableDiffusionPipelineBase): + if not isinstance(quantization_config.dataset, str): + raise ValueError("Please provide dataset as one of the accepted dataset labels.") calibration_dataset = self._prepare_unet_dataset( quantization_config.num_samples, dataset_name=quantization_config.dataset ) @@ -676,14 +678,20 @@ def _remove_unused_columns(self, dataset: "Dataset"): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) - def _prepare_builtin_dataset(self, quantization_config: OVWeightQuantizationConfig): + def _prepare_causal_lm_dataset(self, quantization_config: OVWeightQuantizationConfig): from optimum.gptq.data import get_dataset, prepare_dataset tokenizer = AutoTokenizer.from_pretrained( quantization_config.tokenizer, trust_remote_code=quantization_config.trust_remote_code ) nsamples = quantization_config.num_samples if quantization_config.num_samples else 128 - calibration_dataset = get_dataset(quantization_config.dataset, tokenizer, seqlen=32, nsamples=nsamples) + config_dataset = quantization_config.dataset + if isinstance(config_dataset, str): + calibration_dataset = get_dataset(config_dataset, tokenizer, seqlen=32, nsamples=nsamples) + elif isinstance(config_dataset, list) and all(isinstance(it, str) for it in config_dataset): + calibration_dataset = [tokenizer(text, return_tensors="pt") for text in config_dataset[:nsamples]] + else: + raise ValueError("Please provide dataset as one of the accepted dataset labels or as a list of strings.") calibration_dataset = prepare_dataset(calibration_dataset) calibration_dataset = nncf.Dataset(calibration_dataset, lambda x: self.model.prepare_inputs(**x)) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index d0c414e62..2f92ea5b1 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -223,6 +223,19 @@ class OVWeightCompressionTest(unittest.TestCase): ), 14, ), + ( + OVModelForCausalLM, + "opt", + dict( + bits=4, + sym=True, + group_size=-1, + ratio=0.8, + sensitivity_metric="mean_activation_magnitude", + dataset=["one two, " * i for i in range(10)], + ), + 14, + ), ( 
OVModelForCausalLM, "llama_awq", From ad1fe8b87f7a9efa095afdace142e909f28eeb67 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 19 Aug 2024 09:56:41 +0200 Subject: [PATCH 10/20] limit neural compressor version (#871) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 59eadde37..e637f49e1 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>=2.2.0", "accelerate", "transformers<4.43.0"], + "neural-compressor": ["neural-compressor>=2.2.0,<3.0", "accelerate", "transformers<4.43.0"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"], From e9800ced0f6ceaa7aa0afe67327bfe348815620d Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Mon, 19 Aug 2024 15:14:29 +0400 Subject: [PATCH 11/20] [OV]: load and convert llms in original precision (#778) * [OV]: load and convert llm in original precision * unpatch for onnx * add torch_dtype option for loading model * fix rotary emb initialization * fix patching order * force precision using --weight-format * fix quantization tests * fix test * move torch import --- optimum/exporters/openvino/__main__.py | 42 +++++++++++++++++-- optimum/exporters/openvino/convert.py | 18 ++++++++ optimum/exporters/openvino/model_configs.py | 41 ++++++++++++++++++ optimum/exporters/openvino/model_patcher.py | 46 +++++++++++++++++++++ optimum/intel/openvino/modeling_decoder.py | 10 ++++- tests/openvino/test_export.py | 27 ++++++++++++ tests/openvino/test_modeling.py | 24 +++++++++++ tests/openvino/test_quantization.py | 8 ++-- 8 files changed, 207 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 9fe0de427..5b2b6a9a1 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -21,12 +21,17 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase +from transformers.utils import is_torch_available from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model -from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version +from optimum.intel.utils.import_utils import ( + is_openvino_tokenizers_available, + is_openvino_version, + is_transformers_version, +) from optimum.utils.save_utils import maybe_load_preprocessors from .utils import clear_class_registry @@ -35,6 +40,11 @@ if TYPE_CHECKING: from optimum.intel.openvino.configuration import OVConfig + +if is_torch_available(): + import torch + + _COMPRESSION_OPTIONS = { "int8": {"bits": 8}, "int4_sym_g128": {"bits": 4, "sym": True, "group_size": 128}, @@ -100,6 +110,7 @@ def main_export( stateful: bool = True, convert_tokenizer: bool = False, library_name: Optional[str] = None, + model_loading_kwargs: Optional[Dict[str, Any]] = None, **kwargs_shapes, ): """ @@ -230,7 +241,8 @@ def main_export( do_gptq_patching = False custom_architecture = False - loading_kwargs = {} + patch_16bit = 
False + loading_kwargs = model_loading_kwargs or {} if library_name == "transformers": config = AutoConfig.from_pretrained( model_name_or_path, @@ -281,11 +293,32 @@ def main_export( "Please provide custom export config if you want load model with remote code." ) trust_remote_code = False + dtype = loading_kwargs.get("torch_dtype") + if isinstance(dtype, str): + dtype = config.torch_dtype if dtype == "auto" else getattr(torch, dtype) + if ( + dtype is None + and framework == "pt" + and not do_gptq_patching + and task.startswith("text-generation") + and getattr(config, "torch_dtype", torch.float32) in [torch.float16, torch.bfloat16] + ): + if ov_config is not None and ov_config.dtype in {"fp16", "fp32"}: + dtype = torch.float16 if ov_config.dtype == "fp16" else torch.float32 + elif is_openvino_version(">=", "2024.2") and config.torch_dtype == torch.float16: + dtype = torch.float16 + elif is_openvino_version(">=", "2024.3") and config.torch_dtype == torch.bfloat16: + dtype = torch.bfloat16 + + if dtype is not None: + if dtype in [torch.float16, torch.bfloat16]: + patch_16bit = True + loading_kwargs["torch_dtype"] = dtype + + logger.warning(loading_kwargs) # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: - import torch - torch.set_default_dtype(torch.float32) orig_cuda_check = torch.cuda.is_available torch.cuda.is_available = lambda: True @@ -383,6 +416,7 @@ class StoreAttr(object): preprocessors=preprocessors, device=device, trust_remote_code=trust_remote_code, + patch_16bit_model=patch_16bit, **kwargs_shapes, ) diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index a051e0c43..0b937734c 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -105,6 +105,7 @@ def export( model_kwargs: Optional[Dict[str, Any]] = None, ov_config: Optional["OVConfig"] = None, stateful: bool = True, + patch_16bit_model: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation. @@ -156,6 +157,7 @@ def export( ov_config=ov_config, model_kwargs=model_kwargs, stateful=stateful, + patch_16bit_model=patch_16bit_model, ) elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): @@ -289,6 +291,7 @@ def export_pytorch( model_kwargs: Optional[Dict[str, Any]] = None, ov_config: Optional["OVConfig"] = None, stateful: bool = False, + patch_16bit_model: bool = False, ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an OpenVINO Intermediate Representation. @@ -381,6 +384,10 @@ def ts_patched_forward(*args, **kwargs): patcher.patched_forward = ts_patched_forward with patcher: + if patch_16bit_model: + from openvino.frontend.pytorch.patch_model import __make_16bit_traceable + + __make_16bit_traceable(model) check_dummy_inputs_are_allowed(model, dummy_inputs) sig = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.call) inputs = config.ordered_inputs(model) @@ -401,6 +408,13 @@ def ts_patched_forward(*args, **kwargs): "A stateless model will be exported instead. It may result in sub-optimal inference performance." "Provide a model that can be converted to OpenVINO without fallback to ONNX conversion path." 
) + + if patch_16bit_model: + from openvino.frontend.pytorch.patch_model import unpatch_model + + unpatch_model(model, "_openvino_module_extension_patch_orig_forward") + model.to(torch.float32) + return export_pytorch_via_onnx( model, config, @@ -467,6 +481,7 @@ def export_models( model_kwargs: Optional[Dict[str, Any]] = None, ov_config: Optional["OVConfig"] = None, stateful: bool = True, + patch_16bit_model: bool = False, ) -> Tuple[List[List[str]], List[List[str]]]: """ Export the models to OpenVINO IR format @@ -518,6 +533,7 @@ def export_models( model_kwargs=model_kwargs, ov_config=ov_config, stateful=stateful, + patch_16bit_model=patch_16bit_model, ) ) @@ -538,6 +554,7 @@ def export_from_model( preprocessors: List = None, device: str = "cpu", trust_remote_code: bool = False, + patch_16bit_model: bool = False, **kwargs_shapes, ): model_kwargs = model_kwargs or {} @@ -700,6 +717,7 @@ def export_from_model( stateful=stateful, opset=opset, model_kwargs=model_kwargs, + patch_16bit_model=patch_16bit_model, ) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 7e82db16b..e25c6a4fb 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union from packaging import version +from transformers import PreTrainedModel, TFPreTrainedModel from transformers.utils import is_tf_available from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig @@ -23,6 +24,7 @@ CodeGenOnnxConfig, FalconOnnxConfig, GemmaOnnxConfig, + GPTNeoXOnnxConfig, LlamaOnnxConfig, MistralOnnxConfig, MPTOnnxConfig, @@ -31,6 +33,7 @@ VaeDecoderOnnxConfig, VaeEncoderOnnxConfig, ) +from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( @@ -50,6 +53,9 @@ ChatGLMModelPatcher, CodeGenModelPatcher, DBRXModelPatcher, + FalconModelPatcher, + GptNeoxJapaneseModelPatcher, + GptNeoxModelPatcher, InternLM2Patcher, InternLMModelPatcher, JaisModelPatcher, @@ -60,6 +66,7 @@ PersimmonModelPatcher, Phi3ModelPatcher, QwenModelPatcher, + RotaryEmbPatcher, UpdateCausalMaskModelPatcher, XverseModelPatcher, ) @@ -505,6 +512,12 @@ def patch_model_for_export( return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs) +def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None +) -> "ModelPatcher": + return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs) + + @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 @@ -632,6 +645,11 @@ class FalconOpenVINOConfig(FalconOnnxConfig): ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES DUMMY_PKV_GENERATOR_CLASS = OVFalconDummyPastKeyValuesGenerator + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return FalconModelPatcher(self, model, model_kwargs=model_kwargs) + @register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers") class UNetOpenVINOConfig(UNetOnnxConfig): @@ -725,6 +743,11 @@ class 
GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return GptNeoxJapaneseModelPatcher(self, model, model_kwargs=model_kwargs) + @register_in_tasks_manager( "cohere", @@ -913,3 +936,21 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return MistralModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "gpt-neox", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class GPTNeoxOpenVINOConfig(GPTNeoXOnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index acc85d17b..6e65f4f11 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -108,6 +108,15 @@ def patch_update_causal_mask(model, transformers_version): inner_model._update_causal_mask = types.MethodType(_llama_gemma_update_causal_mask, inner_model) +# initialization of sin/cos cached in bf16/fp16 leads to accuracy loss +# reinitialize them to save in float32 before export +def _reinitialize_cos_sin_cached_fp32(rotary_emb): + if rotary_emb.cos_cached.dtype != torch.float32: + rotary_emb._set_cos_sin_cache( + seq_len=rotary_emb.max_position_embeddings, device=rotary_emb.inv_freq.device, dtype=torch.float32 + ) + + def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ """ batch_size, sequence_length, hidden_dim = hidden_states.shape @@ -158,6 +167,7 @@ def __enter__(self): layer.block_sparse_moe.forward = types.MethodType( _mixtral_sparse_moe_block_forward, layer.block_sparse_moe ) + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -689,6 +699,10 @@ def __enter__(self): self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask self._model.model._update_causal_mask = types.MethodType(_mistral_update_causal_mask, self._model.model) + else: + for layer in self._model.model.layers: + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2224,6 +2238,7 @@ def __enter__(self): orig_self_attn_fwd = layer.self_attn.forward layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn) layer.self_attn._orig_forward = orig_self_attn_fwd + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -2359,8 +2374,39 @@ class UpdateCausalMaskModelPatcher(DecoderModelPatcher): def __enter__(self): super().__enter__() patch_update_causal_mask(self._model, "4.42.0") + if hasattr(self._model.model.layers[0].self_attn.rotary_emb, "_set_cos_sin_cache"): + for layer in self._model.model.layers: 
+ _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) if hasattr(self._model.model, "_orig_update_causal_mask"): self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask + + +class RotaryEmbPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.model.layers: + _reinitialize_cos_sin_cached_fp32(layer.self_attn.rotary_emb) + + +class FalconModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.transformer.h: + _reinitialize_cos_sin_cached_fp32(layer.self_attention.rotary_emb) + + +class GptNeoxModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.gpt_neox.layers: + _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) + + +class GptNeoxJapaneseModelPatcher(DecoderModelPatcher): + def __enter__(self): + super().__enter__() + for layer in self._model.gpt_neox_japanese.layers: + _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 753c8f90d..554fdee7c 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -281,10 +281,17 @@ def _from_transformers( if load_in_8bit is None and not quantization_config: ov_export_config = None else: - ov_export_config = OVConfig(dtype="fp32") + ov_export_config = OVConfig(dtype="auto") stateful = kwargs.pop("stateful", ensure_stateful_is_available(warn=False) and use_cache) + torch_dtype = kwargs.pop("torch_dtype", None) + + model_loading_kwargs = {} + + if torch_dtype is not None: + model_loading_kwargs["torch_dtype"] = torch_dtype + main_export( model_name_or_path=model_id, output=save_dir_path, @@ -298,6 +305,7 @@ def _from_transformers( trust_remote_code=trust_remote_code, ov_config=ov_export_config, stateful=stateful, + model_loading_kwargs=model_loading_kwargs, ) config.is_decoder = True diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index 6d05158dd..ef20ed5a2 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -78,6 +78,7 @@ def _openvino_export( model_type: str, compression_option: Optional[str] = None, stateful: bool = True, + patch_16bit_model: bool = False, ): auto_model = self.SUPPORTED_ARCHITECTURES[model_type] task = auto_model.export_feature @@ -171,6 +172,32 @@ def test_export_with_custom_gen_config(self, model_type): self.assertIsInstance(ov_model.generation_config, GenerationConfig) self.assertTrue(ov_model.generation_config.top_k == 42) + def test_export_fp16_model(self): + auto_model = self.SUPPORTED_ARCHITECTURES["gpt2"] + task = auto_model.export_feature + model_name = MODEL_NAMES["gpt2"] + model = auto_model.auto_model_class.from_pretrained(model_name, torch_dtype=torch.float16) + stateful = True + + for supported_task in [task, task + "with-past"]: + with TemporaryDirectory() as tmpdirname: + export_from_model( + model=model, + output=Path(tmpdirname), + task=task, + preprocessors=None, + patch_16bit_model=True, + stateful=stateful, + ) + use_cache = supported_task.endswith("-with-past") + ov_model = auto_model.from_pretrained(tmpdirname, use_cache=use_cache) + self.assertIsInstance(ov_model, OVBaseModel) + self.assertEqual(ov_model.use_cache, use_cache) + self.assertEqual(ov_model.stateful, stateful and use_cache) + 
self.assertEqual( + ov_model.model.get_rt_info()["optimum"]["transformers_version"], _transformers_version + ) + class CustomExportModelTest(unittest.TestCase): def test_custom_export_config_model(self): diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index bb5024445..35cab5026 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -1015,6 +1015,30 @@ def test_beam_search(self, model_arch): f"generation config : {gen_config}, transformers output {transformers_outputs}, ov_model_stateless output {ov_stateless_outputs}", ) + def test_load_with_different_dtype(self): + set_seed(SEED) + model_id = MODEL_NAMES["llama"] + pt_model = AutoModelForCausalLM.from_pretrained( + model_id, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + texts = ["this is a simple input"] + test_input = tokenizer(texts, return_tensors="pt") + + ref_logits = pt_model(**test_input).logits + torch_dtypes = [None, "auto", "float32", torch.float16] + if is_openvino_version(">", "2024.2.0"): + torch_dtypes.append("bfloat16") + + for dtype in torch_dtypes: + ov_model = OVModelForCausalLM.from_pretrained(model_id=model_id, export=True, torch_dtype=dtype) + ov_logits = ov_model(**test_input).logits + self.assertTrue( + torch.allclose(torch.Tensor(ov_logits), ref_logits, atol=5e-3), + f"values are not close for {dtype if dtype is not None else 'None'}, max diff = {torch.abs(ov_logits - ref_logits).max()}", + ) + class OVModelForMaskedLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 2f92ea5b1..c789f364b 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -249,7 +249,7 @@ class OVWeightCompressionTest(unittest.TestCase): quant_method=QuantizationMethod.AWQ, scale_estimation=True, ), - 16, + 8, ), ( OVModelForCausalLM, @@ -263,7 +263,7 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="c4", quant_method="awq", ), - 16, + 8, ), ) @@ -569,7 +569,7 @@ def test_ovmodel_load_large_model_with_uncompressed_weights(self): save_model_patch.assert_called_with( unittest.mock.ANY, unittest.mock.ANY, - ov_config=OVConfig(dtype="fp32"), + ov_config=OVConfig(dtype="auto"), library_name="transformers", ) @@ -592,7 +592,7 @@ def test_ovmodel_load_large_model_with_additional_quantization_config(self): save_model_patch.assert_called_with( unittest.mock.ANY, unittest.mock.ANY, - ov_config=OVConfig(dtype="fp32"), + ov_config=OVConfig(dtype="auto"), library_name="transformers", ) compression_params = { From 860e09f8b286249fca9ed22fdbe90b719175dd7c Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Thu, 22 Aug 2024 10:42:35 +0200 Subject: [PATCH 12/20] Add INT4 config matching based on model folder name and model short id (#872) * Add matching based on folder name and short id. Add test to catch duplicates. 
* Switch to last dimension --- optimum/intel/openvino/configuration.py | 8 +++++++- tests/openvino/test_quantization.py | 7 +++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 59e87b21e..aaaca031b 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -179,7 +179,8 @@ def _check_default_4bit_configs(model_id_or_path: str): if model_id_or_path in _DEFAULT_4BIT_CONFIGS: return _DEFAULT_4BIT_CONFIGS[model_id_or_path] - config_path = Path(model_id_or_path) / "config.json" + model_path = Path(model_id_or_path) + config_path = model_path / "config.json" if config_path.exists(): with config_path.open("r") as config_f: config = json.load(config_f) @@ -187,6 +188,11 @@ def _check_default_4bit_configs(model_id_or_path: str): if original_model_name in _DEFAULT_4BIT_CONFIGS: return _DEFAULT_4BIT_CONFIGS[original_model_name] + for model_id, config in _DEFAULT_4BIT_CONFIGS.items(): + short_id = model_id.split("/")[-1] + if model_path.name == short_id: + return config + return None diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c789f364b..23ff3a03c 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -903,6 +903,13 @@ def test_named_default_configurations(self, config_id: str): value = prepared_config.__getattribute__(field_name) self.assertEqual(value, reference_value) + def test_for_no_short_id_duplicates(self): + short_ids = set() + for model_id in _DEFAULT_4BIT_CONFIGS.keys(): + short_id = model_id.split("/")[1] + assert short_id not in short_ids + short_ids.add(short_id) + class InferRequestWrapperTest(unittest.TestCase): MODEL_ID = ("openai/whisper-tiny.en",) From 32d193de5987d20e14dc2013ca2a4684bf00a1db Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 22 Aug 2024 12:43:44 +0400 Subject: [PATCH 13/20] update mistral export config to support mistral nemo (#875) * update mistral export config to support mistral nemo * more robust fix and test --- optimum/exporters/openvino/__main__.py | 2 - optimum/exporters/openvino/model_configs.py | 43 +++++++++++++++++++++ tests/openvino/test_modeling.py | 1 + tests/openvino/utils_tests.py | 1 + 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 5b2b6a9a1..77f804960 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -315,8 +315,6 @@ def main_export( if dtype in [torch.float16, torch.bfloat16]: patch_16bit = True loading_kwargs["torch_dtype"] = dtype - - logger.warning(loading_kwargs) # Patch the modules to export of GPTQ models w/o GPU if do_gptq_patching: torch.set_default_dtype(torch.float32) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index e25c6a4fb..0ad38927a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -920,6 +920,44 @@ def patch_model_for_export( return ArcticModelPatcher(self, model, model_kwargs=model_kwargs) +class OVMistralDummyPastKeyValuesGenerator(MistralDummyPastKeyValuesGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + random_batch_size_range: Optional[Tuple[int, int]] = 
None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + **kwargs, + ) + self.head_dim = getattr(normalized_config, "head_dim", self.hidden_size // self.num_attention_heads) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + shape = ( + self.batch_size, + self.num_key_value_heads, + self.sequence_length, + self.head_dim, + ) + return [ + ( + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] + + @register_in_tasks_manager( "mistral", *[ @@ -932,6 +970,11 @@ def patch_model_for_export( library_name="transformers", ) class MistralOpenVINOConfig(MistralOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + OVMistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = OVMistralDummyPastKeyValuesGenerator + def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 35cab5026..d71bbea45 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -657,6 +657,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "qwen2-moe", "arctic", "exaone", + "mistral-nemo", ) GENERATION_LENGTH = 100 diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index e75f3ee62..f13723eef 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -80,6 +80,7 @@ "mbart": "hf-internal-testing/tiny-random-mbart", "minicpm": "katuni4ka/tiny-random-minicpm", "mistral": "echarlaix/tiny-random-mistral", + "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo", "mixtral": "TitanML/tiny-mixtral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", From c177040c77cb561922bab8e8abf74f6804e7678a Mon Sep 17 00:00:00 2001 From: Pawel Raasz Date: Thu, 22 Aug 2024 12:40:01 +0200 Subject: [PATCH 14/20] Correct get tensor name for stateful key, values (#874) --- optimum/exporters/openvino/stateful.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index 8ca42b67a..d8132107a 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -200,10 +200,10 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model): """ key_value_input_names = [ - key.get_any_name() for key in ov_model.inputs if any("key_values" in key_name for key_name in key.get_names()) + key_name for key in ov_model.inputs for key_name in key.get_names() if "key_values" in key_name ] key_value_output_names = [ - key.get_any_name() for key in ov_model.outputs if any("present" in key_name for key_name in key.get_names()) + key_name for key in ov_model.outputs for key_name in key.get_names() if "present" in key_name ] not_kv_inputs = [ input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names()) From 
1f3d0c2f004b6e373556de24edc790d12b49ec98 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Tue, 27 Aug 2024 01:49:39 +0800 Subject: [PATCH 15/20] Ipex patching for llama, falcon, gpt2 (#845) * refactor llama and add falcon * enable falcon * rm arg explain * Patch gpt2 (#10) * add gpt2 patching * add gpt2 tests * fix replace linear * add falcon tests and fix no cache forward * fix format for modeling_utils * rm llama name * fix gpt2 split heads * fix code style * add hints and docstrings * fix falcon low pytorch version tests * add patched models tests * fix comments * Update tests/ipex/test_modeling.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * skip test patched model if ipex < 2.3 --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/exporters/ipex/model_patcher.py | 48 +++ optimum/exporters/ipex/modeling_utils.py | 389 ++++++++++++++--------- optimum/intel/ipex/modeling_base.py | 15 +- optimum/intel/utils/modeling_utils.py | 27 +- tests/ipex/test_modeling.py | 30 +- tests/ipex/utils_tests.py | 5 + 6 files changed, 349 insertions(+), 165 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index fb711d973..216c1c391 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -13,6 +13,8 @@ # limitations under the License. from transformers.models.bert.modeling_bert import BertIntermediate +from transformers.models.falcon.modeling_falcon import FalconDecoderLayer, FalconForCausalLM +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2LMHeadModel from transformers.models.llama.modeling_llama import ( LlamaDecoderLayer, LlamaForCausalLM, @@ -22,10 +24,14 @@ from transformers.models.vit.modeling_vit import ViTIntermediate from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version +from optimum.intel.utils.modeling_utils import replace_customized_linear_with_linear from .modeling_utils import ( _IPEX_MINIMUM_VERSION_FOR_PATCHING, + _gpt2_block_forward, _ipex_rms_layer_norm_forward, + _IPEXFalconDecoderLayer, + _IPEXGPT2Attention, _IPEXIntermediate, _IPEXLlamaDecoderLayer, _llama_model_forward, @@ -67,18 +73,56 @@ def patch_op(m, target_m, new_op_name, new_op): def _patch_llama_model(model): + """ + Patch llama model: + 1. Use IPEX Rope and IAKV cache + 2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add) + """ convert_functions(model, LlamaModel, "forward", _llama_model_forward) convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward) convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config) return model +def _patch_falcon_model(model): + """ + Patch falcon model: + 1. Disable SDPA so the attention mask will be compatible to ipex attention. + 2. Use IPEX Rope and IAKV cache + 3. Linear fusion with (Linear + Gelu) and (Linear + Add + Add) + """ + model.transformer._use_sdpa = False + replace_customized_linear_with_linear(model) + convert_class(model, FalconDecoderLayer, _IPEXFalconDecoderLayer, model.config) + return model + + +def _patch_gpt2_model(model): + """ + Patch gpt2 model: + 1. Disable SDPA so the attention mask will be compatible to ipex attention. + 2. 
Use IAKV cache + """ + model.transformer._attn_implementation = "eager" + convert_class(model, GPT2Attention, _IPEXGPT2Attention, model.config) + convert_functions(model, GPT2Block, "forward", _gpt2_block_forward) + return model + + def _patch_bert_model(model): + """ + Patch bert model: + 1. Linear fusion with Linear + Gelu + """ convert_class(model, BertIntermediate, _IPEXIntermediate) return model def _patch_vit_model(model): + """ + Patch vit model: + 1. Linear fusion with Linear + Gelu + """ convert_class(model, ViTIntermediate, _IPEXIntermediate) return model @@ -94,6 +138,10 @@ def _patch_model(model): ) if isinstance(model, LlamaForCausalLM): model = _patch_llama_model(model) + elif isinstance(model, FalconForCausalLM): + model = _patch_falcon_model(model) + elif isinstance(model, GPT2LMHeadModel): + model = _patch_gpt2_model(model) elif model.config.model_type == "bert": model = _patch_bert_model(model) elif model.config.model_type == "vit": diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 2c74a4232..3d28350b8 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -18,8 +18,10 @@ import torch from torch import nn +from torch.nn import functional as F from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.models.gpt2.modeling_gpt2 import GPT2Block from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv from optimum.intel.utils.import_utils import is_ipex_version @@ -40,6 +42,7 @@ IndirectAccessKVCacheAttention, Linear2SiluMul, LinearAdd, + LinearAddAdd, LinearGelu, RotaryEmbedding, ) @@ -153,62 +156,40 @@ def _llama_model_forward( ) -# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L321 -class _IPEXLlamaAttention(nn.Module): +def _gpt2_block_forward(self, hidden_states, *args, **kwargs): + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + bsz, seq_len, _ = hidden_states.size() + layer_past = kwargs.get("layer_past", None) + past_len = layer_past[0].size(-2) if layer_past is not None else 0 + attention_mask = (1 - attention_mask / torch.finfo(attention_mask.dtype).min).squeeze(1, 2) + attention_mask = _prepare_4d_causal_attention_mask(attention_mask, (bsz, seq_len), hidden_states, past_len) + kwargs["attention_mask"] = attention_mask + + return GPT2Block.forward(self, hidden_states, *args, **kwargs) + + +class _IPEXAttention(nn.Module): def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - - if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mha_linear_add = LinearAdd(module.o_proj) - del self.__dict__["_modules"]["o_proj"] - self.ipex_scale_dot_product = IndirectAccessKVCacheAttention( - text_max_length=module.config.max_position_embeddings - ) - self.ipex_rope = RotaryEmbedding( - module.config.max_position_embeddings, - module.config.hidden_size // module.config.num_attention_heads, - module.config.rope_theta, - module.config.architectures[0], - ) + self.ipex_scale_dot_product = IndirectAccessKVCacheAttention(text_max_length=config.max_position_embeddings) + if hasattr(config, "rope_theta"): + self.ipex_rope = RotaryEmbedding( + config.max_position_embeddings, + config.hidden_size // config.num_attention_heads, + config.rope_theta, + 
config.architectures[0], + ) def qkv_gemm(self, hidden_states): - bsz, seq_len, _ = hidden_states.size() - - query = self.q_proj(hidden_states) - key = self.k_proj(hidden_states) - value = self.v_proj(hidden_states) + raise NotImplementedError("Need to implement in specific model class") - query = query.view(bsz, seq_len, self.num_heads, self.head_dim) - key = key.view(bsz, seq_len, self.num_key_value_heads, self.head_dim) - value = value.view(bsz, seq_len, self.num_key_value_heads, self.head_dim) - - return query, key, value + def rope(self, *args, **kwargs): + raise NotImplementedError("Need to implement in specific model class") - def rope(self, query, key, kv_seq_len, position_ids, use_cache): - if use_cache: - key = self.ipex_rope( - key, - position_ids, - self.num_key_value_heads, - self.head_dim, - self.head_dim // 2, - self.head_dim, - kv_seq_len, - ) - query = self.ipex_rope( - query, - position_ids, - self.num_heads, - self.head_dim, - self.head_dim // 2, - self.head_dim, - kv_seq_len, - ) - return query, key - - def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, position_ids): + def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): # This ipex op pre-allocates buffers for past_key_values and use beam index history # which to decide which beam should be used to make attention scale dot more efficient. (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( @@ -217,36 +198,21 @@ def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, pos value, math.sqrt(self.head_dim), past_key_value, - None, + kwargs.get("head_mask", None), attention_mask, + kwargs.get("alibi", None), ) return attn_output, past_key_value, attn_weights - # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L341 - def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, position_ids): - value_states = value.transpose(1, 2) - query_states = query.transpose(1, 2) - key_states = key.transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - past_key_value = None - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): + raise NotImplementedError("Need to implement in specific model class") - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: - attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask) - attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) + def prepare_attention_mask_float(self, attention_mask, *args): + return attention_mask - return attn_output, past_key_value, attn_weights + def postprocess_attention_output(self, attn_output, bsz, seq_len): + attn_output = attn_output.transpose(1, 2).reshape(bsz, seq_len, self.hidden_size) + return attn_output def forward( self, @@ -256,53 +222,148 @@ def forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, 
use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - residual: Optional[torch.Tensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - Attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`. - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. - residual (`torch.Tensor`): residual tensor to the layer of shape (batch, seq_len, embed_dim)` - """ + # For llama inputs: https://github.com/huggingface/transformers/blob/v4.43.4/src/transformers/models/llama/modeling_llama.py#L308 + # For falcon inputs: https://github.com/huggingface/transformers/blob/v4.43.4/src/transformers/models/falcon/modeling_falcon.py#L370 + if past_key_value is None and kwargs.get("layer_past", None) is not None: + past_key_value = kwargs.pop("layer_past", None) bsz, seq_len, _ = hidden_states.size() - kv_seq_len = seq_len + past_key_value[0].size(-2) if past_key_value is not None else seq_len + past_len = past_key_value[0].size(-2) if past_key_value is not None else 0 + kv_seq_len = seq_len + past_len - query, key, value = self.qkv_gemm(hidden_states) - query, key = self.rope(query, key, kv_seq_len, position_ids, use_cache) + qkv_out = self.qkv_gemm(hidden_states) + if isinstance(qkv_out, tuple) and len(qkv_out) == 3: + query, key, value = self.qkv_gemm(hidden_states) + query, key = self.rope(query, key, kv_seq_len, use_cache, position_ids=position_ids) + else: + query, key, value = self.rope(qkv_out, kv_seq_len, use_cache, past_len=past_len) + attention_mask = self.prepare_attention_mask_float(attention_mask, query.dtype) sdpa = self.sdpa_with_cache if use_cache else self.sdpa_without_cache attn_output, past_key_value, attn_weights = sdpa( - query, key, value, past_key_value, attention_mask, position_ids + query, + key, + value, + past_key_value, + attention_mask, + position_ids=position_ids, + head_mask=kwargs.get("head_mask", None), + alibi=kwargs.get("alibi", None), ) - attn_output = attn_output.transpose(1, 2).reshape(bsz, seq_len, self.hidden_size) - - if hasattr(self, "mha_linear_add"): - attn_output = self.mha_linear_add(attn_output, residual) - else: - attn_output = self.o_proj(attn_output) - attn_output = residual + attn_output + attn_output = self.postprocess_attention_output(attn_output, bsz, seq_len) if not 
output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + return attn_output, past_key_value, attn_weights + + +class _IPEXLlamaAttention(_IPEXAttention): + def __init__(self, module, config) -> None: + super().__init__(module, config) + if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mha_linear_add = LinearAdd(module.o_proj) + del self.__dict__["_modules"]["o_proj"] + + def qkv_gemm(self, hidden_states): + bsz, seq_len, _ = hidden_states.size() + query = self.q_proj(hidden_states).view(bsz, seq_len, self.num_heads, self.head_dim) + key = self.k_proj(hidden_states).view(bsz, seq_len, self.num_key_value_heads, self.head_dim) + value = self.v_proj(hidden_states).view(bsz, seq_len, self.num_key_value_heads, self.head_dim) + + return query, key, value + + def rope(self, query, key, kv_seq_len, use_cache, position_ids): + if use_cache: + args = (self.head_dim, self.head_dim // 2, self.head_dim, kv_seq_len) + key = self.ipex_rope(key, position_ids, self.num_key_value_heads, *args) + query = self.ipex_rope(query, position_ids, self.num_heads, *args) + return query, key + + # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L341 + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, position_ids, **kwargs): + query, key, value = query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2) + cos, sin = self.rotary_emb(value, position_ids) + query, key = apply_rotary_pos_emb(query, key, cos, sin) + # repeat k/v heads if n_kv_heads < n_heads + key = repeat_kv(key, self.num_key_value_groups) + value = repeat_kv(value, self.num_key_value_groups) + attn_weights = torch.matmul(query, key.transpose(2, 3)) / math.sqrt(self.head_dim) + if attention_mask is not None: + attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask) + attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)) + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_output = torch.matmul(attn_weights, value) + + return attn_output, None, attn_weights + + +class _IPEXFalconAttention(_IPEXAttention): + def qkv_gemm(self, hidden_states): + return self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + + def rope(self, fused_qkv, seq_len, use_cache, past_len): + if use_cache: + query, key, value = self.ipex_rope( + fused_qkv, + torch.tensor(past_len), + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + seq_len, + 3, + ) + else: + (query, key, value) = self._split_heads(fused_qkv) + return query, key, value + + def prepare_attention_mask_float(self, attention_mask, dtype): + attention_mask_float = ( + (attention_mask * 1.0).masked_fill(attention_mask.to(torch.bool), float("-1e9")).to(dtype) + ) + return attention_mask_float + + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): + bs, q_len = query.shape[0], query.shape[1] + query, key, value = query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2) + attn_output = F.scaled_dot_product_attention(query, key, value, attention_mask, 0.0, is_causal=False) + attn_output = attn_output.view(bs, self.num_heads, q_len, self.head_dim) + + return attn_output, None, None + + +class _IPEXGPT2Attention(_IPEXAttention): + def __init__(self, module, config) -> None: + super().__init__(module, config) + + def _split_heads_ipex(self, 
tensor, num_heads, attn_head_size): + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + return tensor.view(new_shape) # (batch, seq_length, head, head_features) + + def qkv_gemm(self, hidden_states): + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + query = self._split_heads_ipex(query, self.num_heads, self.head_dim) + key = self._split_heads_ipex(key, self.num_heads, self.head_dim) + value = self._split_heads_ipex(value, self.num_heads, self.head_dim) + return query, key, value + + def rope(self, query, key, *args, **kwargs): + return query, key + + def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): + query, key, value = query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2) + attn_output = F.scaled_dot_product_attention(query, key, value, attention_mask, 0.0, is_causal=True) + + return attn_output, None, None + + def postprocess_attention_output(self, attn_output, bsz, seq_len): + attn_output = attn_output.transpose(1, 2).reshape(bsz, seq_len, self.embed_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + return attn_output # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L186 @@ -311,7 +372,6 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: self.mlp_linear_add = LinearAdd(module.down_proj) @@ -321,11 +381,6 @@ def __init__(self, module, config) -> None: del self.__dict__["_modules"]["up_proj"] def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, **kwargs): - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - residual (`torch.Tensor`): residual tensor to the layer of shape (batch, seq_len, embed_dim)` - """ if hasattr(self, "linear_silu_mul"): mlp_gate = self.linear_silu_mul(hidden_states) if hasattr(self, "mlp_linear_add"): @@ -340,69 +395,91 @@ def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, ** return hidden_states -# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 -class _IPEXLlamaDecoderLayer(nn.Module): - def __init__(self, module, config): +class _IPEXFalconMLP(nn.Module): + def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) - self.self_attn = _IPEXLlamaAttention(module.self_attn, config) - self.mlp = _IPEXLlamaMLP(module.mlp, config) + self.config = config + # LinearAllreduce and LinearLayer cannot use fused op LinearAdd + self.linear_gelu = LinearGelu(module.dense_h_to_4h) + del self.__dict__["_modules"]["dense_h_to_4h"] + if module.dense_4h_to_h.__class__.__name__ not in ["LinearAllreduce"]: + self.linear_add_add = LinearAddAdd(module.dense_4h_to_h) + del self.__dict__["_modules"]["dense_4h_to_h"] def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, + attention_output: torch.Tensor = None, + residual: torch.Tensor = None, **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, 
torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - Attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`. - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - """ + ): + mlp_hidden_states = self.linear_gelu(hidden_states) + if hasattr(self, "linear_add_add"): + output = self.linear_add_add(mlp_hidden_states, attention_output, residual) + else: + mlp_output = self.mlp.dense_4h_to_h(mlp_hidden_states) + output = mlp_output + attention_output + residual + + return output + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 +class _IPEXLlamaDecoderLayer(nn.Module): + def __init__(self, module, config): + super().__init__() + _setattr_from_module(self, module) + self.self_attn = _IPEXLlamaAttention(module.self_attn, config) + self.mlp = _IPEXLlamaMLP(module.mlp, config) + def forward(self, hidden_states: torch.Tensor, **kwargs): + # Please see the original model's forward to check the parameter residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=None, - residual=residual, - **kwargs, - ) + hidden_states, present, attn_weights = self.self_attn(hidden_states=hidden_states, **kwargs) + if hasattr(self.self_attn, "mha_linear_add"): + hidden_states = self.self_attn.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.o_proj(hidden_states) + hidden_states = residual + hidden_states # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states, residual, **kwargs) outputs = (hidden_states,) + if kwargs.get("output_attentions", False): + outputs += (attn_weights,) + if kwargs.get("use_cache", False): + outputs += (present,) - if output_attentions: - outputs += (self_attn_weights,) + return outputs - if use_cache: - outputs += (present_key_value,) + +class _IPEXFalconDecoderLayer(nn.Module): + def __init__(self, module, config): + super().__init__() + _setattr_from_module(self, module) + self.self_attention = _IPEXFalconAttention(module.self_attention, config) + self.mlp = _IPEXFalconMLP(module.mlp, config) + + def forward(self, hidden_states: torch.Tensor, **kwargs): + # Please see the original model's forward to check the parameter + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + 
attn_output, present, attn_weights = self.self_attention(hidden_states, **kwargs) + attn_output = self.self_attention.dense(attn_output) + hidden_states = self.mlp(hidden_states, attn_output, residual) + + outputs = (hidden_states,) + if kwargs.get("output_attentions", False): + outputs += (attn_weights,) + if kwargs.get("use_cache", False): + outputs += (present,) return outputs diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 67e707d59..dfa14d467 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -64,7 +64,7 @@ logger = logging.getLogger(__name__) -_IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit") +_IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit", "falcon", "gpt2") _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation") @@ -481,7 +481,14 @@ def __init__( elif "_reorder_cache" in self.model_cls.__dict__: self._reorder_cache = self.model_cls._reorder_cache.__get__(self) - if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon", "mistral"}: + if is_transformers_version(">=", "4.38.0") and model_type in { + "llama", + "phi", + "persimmon", + "mistral", + "falcon", + "gpt2", + }: self.prepare_inputs_for_generation = _ipex_prepare_inputs_for_generation else: self.prepare_inputs_for_generation = self.model_cls.prepare_inputs_for_generation.__get__(self) @@ -500,8 +507,8 @@ def _prepare_past_key_values(self, input_ids): d_k = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads batch_size = input_ids.shape[0] - if model_type in {"mistral", "llama"}: - num_attention_heads = self.normalized_config.num_key_value_heads + if model_type in {"mistral", "llama", "falcon"}: + num_attention_heads = getattr(self.normalized_config, "num_key_value_heads", 1) else: num_attention_heads = self.normalized_config.num_attention_heads diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 9b68266d1..cd5b34f86 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -20,7 +20,7 @@ from huggingface_hub import HfApi, HfFolder -MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"} +MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"} def get_model_device(model: torch.nn.Module) -> torch.device: @@ -110,3 +110,28 @@ def _find_files_matching_pattern( files = [Path(p) for p in repo_files if re.match(pattern, str(p)) and str(p.parent) == subfolder] return files + + +def replace_customized_linear_with_linear(model): + """ + Replace custom linear to torch linear so ipex could recognize and replace them to ipex linear. 
+ """ + if isinstance(model, torch.jit.ScriptModule): + return + if not model.training: + for child_name, child in model.named_children(): + if isinstance(child, torch.nn.Linear) and child.__class__.__name__ in [ + "FalconLinear", + "Linear", + ]: + new_m = torch.nn.Linear( + child.in_features, + child.out_features, + bias=False if child.bias is None else True, + ) + new_m.weight = child.weight + if child.bias is not None: + new_m.bias = child.bias + setattr(model, child_name, new_m) + else: + replace_customized_linear_with_linear(child) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 7b042a4e0..01f935292 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -213,6 +213,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "blenderbot-small", "bloom", "codegen", + "falcon", "gpt2", "gpt_neo", "gpt_neox", @@ -220,10 +221,11 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "llama", "llama2", # "phi", + "distilgpt2", "mpt", "opt", ) - IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2",) + IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "distilgpt2", "falcon") GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.0 @@ -263,8 +265,9 @@ def test_compare_to_transformers(self, model_arch): # Compare tensor outputs self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) - self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) - self.assertTrue(torch.equal(outputs.logits, init_model_outputs.logits)) + # To avoid float pointing error + self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) + self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -281,7 +284,8 @@ def test_pipeline(self, model_arch): # High optimized model llama is not supported assisted decoding for now. @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_assisted_decoding(self, model_arch): - if model_arch == "llama2": + # Patched models are not support assisted decoding for now. 
+ if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES: return model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -335,6 +339,24 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): self.assertIsInstance(outputs, torch.Tensor) self.assertTrue(torch.equal(outputs, transformers_outputs)) + @parameterized.expand(IPEX_PATCHED_SUPPORTED_ARCHITECTURES) + @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching") + def test_patched_model(self, model_arch): + model_id = MODEL_NAMES[model_arch] + patched_model_id = MODEL_NAMES["patched_" + model_arch] + ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) + exported_model = IPEXModelForCausalLM.from_pretrained(patched_model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer( + "This is a sample", + return_tensors="pt", + return_token_type_ids=False if model_arch in ("llama", "llama2") else None, + ) + inputs = ipex_model.prepare_inputs_for_generation(**tokens) + ipex_outputs = ipex_model(**inputs) + exported_outputs = exported_model(**inputs) + self.assertTrue(torch.allclose(ipex_outputs.logits, exported_outputs.logits, atol=1e-7)) + def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/ipex/utils_tests.py b/tests/ipex/utils_tests.py index a14f0bf7c..595bc0246 100644 --- a/tests/ipex/utils_tests.py +++ b/tests/ipex/utils_tests.py @@ -25,8 +25,10 @@ "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", "convnext": "hf-internal-testing/tiny-random-convnext", "distilbert": "hf-internal-testing/tiny-random-distilbert", + "distilgpt2": "Jiqing/tiny_random_distilgpt2", "electra": "hf-internal-testing/tiny-random-electra", "flaubert": "hf-internal-testing/tiny-random-flaubert", + "falcon": "Jiqing/tiny_random_falcon", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", @@ -54,4 +56,7 @@ "vit": "hf-internal-testing/tiny-random-vit", "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", "xlm": "hf-internal-testing/tiny-random-xlm", + "patched_falcon": "Jiqing/patched_tiny_random_falcon_for_causal_lm", + "patched_distilgpt2": "Jiqing/patched_tiny_random_distilgpt2_for_causal_lm", + "patched_llama2": "Jiqing/patched_tiny_random_llama2_for_causal_lm", } From 403c696bb1c57284b88086d0ad961361c5bc6574 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:52:08 +0800 Subject: [PATCH 16/20] Add IPEX documentation (#828) * change readme, source/index, source/installation * add ipex doc 1st step * update readme for command line usage * fix bug for ipex readme * add export doc * update all ipex docs * rm diffusers * change register * Update README.md Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update docs/source/installation.mdx Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * fix readme * fix ipex exporter args comments * extend ipex export explain * fix ipex reference.mdx * add comments for auto doc * rm cli export * Update optimum/commands/export/ipex.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * rm commit hash in export command * rm export * rm jit * add ipex on doc's docker file * 
indicate that ipex model only supports for cpu and the export format will be changed to compile in the future * Update docs/source/ipex/inference.mdx Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * explain patching * rm ipex reference * Update docs/source/ipex/inference.mdx * Update docs/source/ipex/inference.mdx * Update docs/source/ipex/inference.mdx * Update docs/source/index.mdx * Update docs/source/ipex/inference.mdx * Update docs/source/ipex/models.mdx * Update docs/Dockerfile --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 3 +- docs/source/_toctree.yml | 11 ++++++ docs/source/index.mdx | 2 ++ docs/source/installation.mdx | 3 +- docs/source/ipex/inference.mdx | 45 +++++++++++++++++++++++ docs/source/ipex/models.mdx | 46 ++++++++++++++++++++++++ docs/source/ipex/tutorials/notebooks.mdx | 16 +++++++++ optimum/intel/ipex/modeling_base.py | 36 ++++++++++++++++++- optimum/intel/ipex/utils.py | 4 +++ 9 files changed, 162 insertions(+), 4 deletions(-) create mode 100644 docs/source/ipex/inference.mdx create mode 100644 docs/source/ipex/models.mdx create mode 100644 docs/source/ipex/tutorials/notebooks.mdx diff --git a/README.md b/README.md index 3dbe557e5..97337f723 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,6 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) results = pipe("He's a dreadful magician and") - ``` For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). @@ -231,7 +230,7 @@ For more details, please refer to the [documentation](https://intel.github.io/in ## Running the examples -Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. +Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) and [`notebooks`](https://github.com/huggingface/optimum-intel/tree/main/notebooks) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. Do not forget to install requirements for every example: diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7053a17ef..94ae09bb6 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -30,5 +30,16 @@ title: Tutorials isExpanded: false title: OpenVINO + - sections: + - local: ipex/inference + title: Inference + - local: ipex/models + title: Supported Models + - sections: + - local: ipex/tutorials/notebooks + title: Notebooks + title: Tutorials + isExpanded: false + title: IPEX title: Optimum Intel isExpanded: false diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 75e99d868..c9ad66206 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -19,6 +19,8 @@ limitations under the License. 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures. 
+[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) (IPEX) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion. + [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target. [OpenVINO](https://docs.openvino.ai) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/2024/about-openvino/compatibility-and-support/supported-devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index aaab1b1f8..cb3e9c758 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -22,6 +22,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi |:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------| | [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade --upgrade-strategy eager "optimum[neural-compressor]"`| | [Intel OpenVINO](https://docs.openvino.ai ) | `pip install --upgrade --upgrade-strategy eager "optimum[openvino]"` | +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -42,4 +43,4 @@ or to install from source including dependencies: python -m pip install "optimum-intel[extras]"@git+https://github.com/huggingface/optimum-intel.git ``` -where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`. +where `extras` can be one or more of `neural-compressor`, `openvino`, `ipex`. diff --git a/docs/source/ipex/inference.mdx b/docs/source/ipex/inference.mdx new file mode 100644 index 000000000..c712275e4 --- /dev/null +++ b/docs/source/ipex/inference.mdx @@ -0,0 +1,45 @@ + + +# Inference + +Optimum Intel can be used to load models from the [Hub](https://huggingface.co/models) and create pipelines to run inference with IPEX optimizations (including patching with custom operators, weight prepacking and graph mode) on a variety of Intel processors. 
For now support is only enabled for CPUs. + + +## Loading + +You can load your model and apply IPEX optimizations (including weight prepacking and graph mode). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators. +For now, support is only enabled for CPUs and the original model will be exported via TorchScript. In the future `torch.compile` will be used and model exported via TorchScript will get deprecated. + +```diff + import torch + from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForCausalLM ++ from optimum.intel import IPEXModelForCausalLM + + model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + results = pipe("He's a dreadful magician and") +``` + +As shown in the table below, each task is associated with a class enabling to automatically load your model. + +| Auto Class | Task | +|--------------------------------------|--------------------------------------| +| `IPEXModelForSequenceClassification` | `text-classification` | +| `IPEXModelForTokenClassification` | `token-classification` | +| `IPEXModelForQuestionAnswering` | `question-answering` | +| `IPEXModelForImageClassification` | `image-classification` | +| `IPEXModel` | `feature-extraction` | +| `IPEXModelForMaskedLM` | `fill-mask` | +| `IPEXModelForAudioClassification` | `audio-classification` | +| `IPEXModelForCausalLM` | `text-generation` | diff --git a/docs/source/ipex/models.mdx b/docs/source/ipex/models.mdx new file mode 100644 index 000000000..346ca2659 --- /dev/null +++ b/docs/source/ipex/models.mdx @@ -0,0 +1,46 @@ + + +# Supported models + +🤗 Optimum provides IPEX optimizations for both eager mode and graph mode. It provides classes and functions to perform this step easily. 
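As a quick illustration of how these classes are used (a minimal sketch added for clarity, not part of the patch itself — `gpt2` and `max_new_tokens=32` are arbitrary placeholder choices):

```python
import torch
from transformers import AutoTokenizer

from optimum.intel import IPEXModelForCausalLM

model_id = "gpt2"  # placeholder checkpoint; any causal LM from the list below can be substituted
# export=True converts the vanilla Transformers checkpoint and applies the IPEX optimizations
# (operator patching for supported architectures, weight prepacking, graph mode) at load time.
model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("He's a dreadful magician and", return_tensors="pt")
generated = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])
```

The other `IPEXModelForXxx` classes listed in the inference guide are loaded with the same `from_pretrained(..., export=True)` pattern for their respective tasks.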
+Here is the list of the supported architectures : + +## [Transformers](https://huggingface.co/docs/transformers/index) + +- Albert +- Bart +- Beit +- Bert +- BlenderBot +- BlenderBotSmall +- Bloom +- CodeGen +- DistilBert +- Electra +- Flaubert +- GPT-2 +- GPT-BigCode +- GPT-Neo +- GPT-NeoX +- Llama +- MPT +- Mistral +- MobileNet v1 +- MobileNet v2 +- MobileVit +- OPT +- ResNet +- Roberta +- Roformer +- SqueezeBert +- UniSpeech +- Vit +- Wav2Vec2 +- XLM diff --git a/docs/source/ipex/tutorials/notebooks.mdx b/docs/source/ipex/tutorials/notebooks.mdx new file mode 100644 index 000000000..2093e4fca --- /dev/null +++ b/docs/source/ipex/tutorials/notebooks.mdx @@ -0,0 +1,16 @@ + + +# Notebooks + +## Inference + +| Notebook | Description | | | +|:---------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------- |:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|------:| +| [How to run inference with the IPEX](https://github.com/huggingface/optimum-intel/tree/main/notebooks/ipex) | Explains how to export your model to IPEX and to run inference with IPEX model on text-generation task | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/ipex/text_generation.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/ipex/text_generation.ipynb) | diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index dfa14d467..568e5be62 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -198,7 +198,7 @@ def _from_pretrained( token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: str = HUGGINGFACE_HUB_CACHE, + cache_dir: Union[str, Path] = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, torch_dtype: Optional[Union[str, "torch.dtype"]] = None, @@ -206,6 +206,40 @@ def _from_pretrained( file_name: Optional[str] = WEIGHTS_NAME, **kwargs, ): + """ + Loads a model and its configuration file from a directory or the HF Hub. + + Arguments: + model_id (`str` or `Path`): + The directory from which to load the model. + Can be either: + - The model id of a pretrained model hosted inside a model repo on huggingface.co. + - The path to a directory containing the model weights. + use_auth_token (Optional[Union[bool, str]], defaults to `None`): + Deprecated. Please use `token` instead. + token (Optional[Union[bool, str]], defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*): + The specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. 
+ cache_dir (`Union[str, Path]`, *optional*): + The path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + subfolder (`str`, *optional*) + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can specify the folder name here. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + torch_dtype (`Optional[Union[str, "torch.dtype"]]`, *optional*) + float16 or bfloat16 or float32: load in a specified dtype, ignoring the model config.torch_dtype if one exists. If not specified, the model will get loaded in float32. + trust_remote_code (`bool`, *optional*) + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository. + file_name (`str`, *optional*): + The file name of the model to load. Overwrites the default file name and allows one to load the model + with a different name. + """ if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", diff --git a/optimum/intel/ipex/utils.py b/optimum/intel/ipex/utils.py index b2644e659..3d3feb3db 100644 --- a/optimum/intel/ipex/utils.py +++ b/optimum/intel/ipex/utils.py @@ -14,8 +14,12 @@ _HEAD_TO_AUTOMODELS = { + "feature-extraction": "IPEXModel", "text-generation": "IPEXModelForCausalLM", "text-classification": "IPEXModelForSequenceClassification", "token-classification": "IPEXModelForTokenClassification", "question-answering": "IPEXModelForQuestionAnswering", + "fill-mask": "IPEXModelForMaskedLM", + "image-classification": "IPEXModelForImageClassification", + "audio-classification": "IPEXModelForAudioClassification", } From 9a18ae0119ee5e6669b42f49a6fe7ac3397ac55b Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Tue, 27 Aug 2024 09:19:47 -0700 Subject: [PATCH 17/20] set cpu affinity and membind for better oob performance (#853) * set num threads and memory binding for better OOB performance * clean env var * added core and memory binding util for improved performance * add example usage in docstring * change utlity for best oob to support world_size and rank >=1 * fix style * fix node_id value to account for rank_id starts at zero * numa node assignment calculated from local size not from world size * reorg imports, moved checks to import_utils, remove prints for logger * raise Errors with missing pkg and unsupported OS * added missng env var to list * Update optimum/intel/utils/modeling_utils.py * Update optimum/intel/utils/import_utils.py * Update optimum/intel/utils/import_utils.py * fix style quality error --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- docker/Dockerfile.intel | 9 +-- optimum/intel/utils/__init__.py | 1 + optimum/intel/utils/import_utils.py | 12 ++++ optimum/intel/utils/modeling_utils.py | 82 +++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.intel b/docker/Dockerfile.intel index 60fd51b42..a7f1dc978 100644 --- a/docker/Dockerfile.intel +++ b/docker/Dockerfile.intel @@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ libpng-dev \ python3 \ python3-pip \ + python3-dev \ 
+ libnuma-dev \ && rm -rf /var/lib/apt/lists/*" RUN /usr/sbin/update-ccache-symlinks RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache @@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \ torchaudio==${TORCHAUDIO_VERSION} \ -f https://download.pytorch.org/whl/torch_stable.html && \ python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \ - python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ + python3 -m pip install --no-cache-dir numa -ARG OMP_NUM_THREADS=1 -ENV OMP_NUM_THREADS=${OMP_NUM_THREADS} ARG KMP_BLOCKTIME=1 ENV KMP_BLOCKTIME=${KMP_BLOCKTIME} ARG KMP_HW_SUBSET=1T ENV KMP_HW_SUBSET=${KMP_HW_SUBSET} -ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so" \ No newline at end of file +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so" diff --git a/optimum/intel/utils/__init__.py b/optimum/intel/utils/__init__.py index d77588f89..50cdfa143 100644 --- a/optimum/intel/utils/__init__.py +++ b/optimum/intel/utils/__init__.py @@ -22,6 +22,7 @@ is_neural_compressor_available, is_neural_compressor_version, is_nncf_available, + is_numa_available, is_openvino_available, is_torch_version, is_transformers_available, diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 6be0aac47..032280e94 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -150,6 +150,14 @@ except importlib_metadata.PackageNotFoundError: _accelerate_available = False +_numa_available = importlib.util.find_spec("numa") is not None + +if _numa_available: + try: + importlib_metadata.version("numa") + except importlib_metadata.PackageNotFoundError: + _numa_available = False + def is_transformers_available(): return _transformers_available @@ -272,6 +280,10 @@ def is_accelerate_available(): return _accelerate_available +def is_numa_available(): + return _numa_available + + # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): """ diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cd5b34f86..1d2f7b03c 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -12,16 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging +import math +import os +import platform import re from pathlib import Path from typing import List, Optional, Union +import psutil import torch from huggingface_hub import HfApi, HfFolder +from .import_utils import is_numa_available + MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"} +logger = logging.getLogger(__name__) + def get_model_device(model: torch.nn.Module) -> torch.device: """ @@ -135,3 +144,76 @@ def replace_customized_linear_with_linear(model): setattr(model, child_name, new_m) else: replace_customized_linear_with_linear(child) + + +def get_int_from_env(env_keys, default): + """Returns the first positive env value found in the `env_keys` list or the default.""" + for e in env_keys: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +def bind_cores_for_best_perf(): + """ + Set number of threads per rank, numa cpu affinity and numa memory binding if not already set for better OOB performance. + Works for wold_size >= 1 and rank >= 1 + + Example: + .. code-block:: python + + from optimum.intel.ipex import IPEXModelForCausalLM + from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf + + bind_cores_for_best_perf() + model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True) + tokenizer = AutoTokenizer.from_pretrained("gpt2") + input_sentence = ["tell me a story about a trip to the moon"] + model_inputs = tokenizer(input_sentence, return_tensors="pt") + generation_kwargs = dict(max_new_tokens=500) + generated_ids = model.generate(**model_inputs, **generation_kwargs) + + Returns: + None + + """ + if platform.system() != "Linux": + logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + if not is_numa_available(): + logger.error("'numa' module not found") + raise ImportError("'numa' module not found, install with 'pip install numa'") + import numa + + local_size = get_int_from_env( + ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1 + ) + rank_id = get_int_from_env( + ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0 + ) + nodes = numa.get_max_node() + 1 + rank_per_node = math.ceil(local_size / nodes) + num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes) + node_id = int(rank_id / rank_per_node) + rank_offset_per_node = rank_id % rank_per_node + if os.getenv("OMP_NUM_THREADS") is None: + num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1) + logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance") + else: + num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS")) + logger.info(f"OMP_NUM_THREADS already set to {num_cpus_per_rank}") + if len(numa.get_membind()) == nodes: + # if numa memory binding is not set, set it to the node where the rank is running + numa.set_membind([node_id]) + + torch.set_num_threads(num_cpus_per_rank) + + if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True): + # if numa affinity is unset (default value is set to all logical cores) set it to the physical cores assigned to the rank + cpu_start = num_cpus_per_rank * rank_offset_per_node + numa.set_affinity( + 0, + list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank], + ) + logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}") From 
af8c28d46e2e3b589170866502093c5af34b749c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 30 Aug 2024 15:43:45 +0400 Subject: [PATCH 18/20] Fix openvino nightly install in tests (#885) * fix openvino nightly install in tests * Update .github/workflows/test_openvino.yml --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- .github/workflows/test_openvino.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 13a6b83e5..226240789 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -51,7 +51,6 @@ jobs: pytest tests/openvino/test_modeling_basic.py - name: Test openvino-nightly run: | - pip uninstall -y openvino - pip install openvino-nightly + pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov From d6e6e1f0350ef0b66dab5266196d56f3a5dd4c7c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 30 Aug 2024 17:06:07 +0400 Subject: [PATCH 19/20] Fix attention mask for glm4 (#884) --- optimum/exporters/openvino/model_patcher.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6e65f4f11..8cb745bd7 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -308,10 +308,9 @@ def _chatglm2_core_attention_forward(self, query_layer, key_layer, value_layer, def _glm4_core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention( - query_layer, key_layer, value_layer, attention_mask.to(torch.float32) - ) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + causal_mask.masked_fill_(attention_mask, float("-inf")) + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, causal_mask) context_layer = context_layer.transpose(1, 2).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) context_layer = context_layer.reshape(*new_context_layer_shape) From b5998f2f44e581b102ed7a9b714ac0f7c2d51a66 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 30 Aug 2024 16:11:11 +0200 Subject: [PATCH 20/20] Apply weight compression after model save to reduce peak RAM during export (#878) * Initial commit * Style * Adopt tests * Add no-nncf warning * Apply suggested changes * Do not save in fp16 in case of weight compression * Replace model files right away --- optimum/exporters/openvino/__main__.py | 50 ++++++++- optimum/exporters/openvino/convert.py | 38 +------ tests/openvino/test_quantization.py | 138 +++++++++++++------------ 3 files changed, 123 insertions(+), 103 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 77f804960..c4b6ef0cd 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -14,7 +14,9 @@ import gc import logging +import operator import warnings +from functools import reduce from pathlib import 
Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union @@ -23,18 +25,20 @@ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase from transformers.utils import is_torch_available +from openvino.runtime import Core, Type, save_model from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model from optimum.intel.utils.import_utils import ( + is_nncf_available, is_openvino_tokenizers_available, is_openvino_version, is_transformers_version, ) from optimum.utils.save_utils import maybe_load_preprocessors -from .utils import clear_class_registry +from .utils import _MAX_UNCOMPRESSED_SIZE, clear_class_registry if TYPE_CHECKING: @@ -402,7 +406,7 @@ class StoreAttr(object): model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code ) - export_from_model( + submodel_paths = export_from_model( model=model, output=output, task=task, @@ -425,6 +429,48 @@ class StoreAttr(object): del model gc.collect() + core = Core() + for submodel_path in submodel_paths: + submodel_path = Path(output) / submodel_path + submodel = core.read_model(submodel_path) + + quantization_config = None + if ov_config is None: + num_parameters = 0 + for op in submodel.get_ops(): + if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]: + num_parameters += reduce(operator.mul, op.shape, 1) + if num_parameters >= _MAX_UNCOMPRESSED_SIZE: + if is_nncf_available(): + quantization_config = {"bits": 8, "sym": False} + logger.info("The model weights will be quantized to int8_asym.") + else: + logger.warning( + "The model will be converted with no weights quantization. Quantization of the weights to int8 " + "requires nncf. 
Please install it with `pip install nncf`" + ) + break + else: + quantization_config = ov_config.quantization_config + if quantization_config is None: + continue + + if not is_nncf_available(): + raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`") + + from optimum.intel.openvino.quantization import _weight_only_quantization + + _weight_only_quantization(submodel, quantization_config) + + compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" + save_model(submodel, compressed_submodel_path, compress_to_fp16=False) + del submodel + + submodel_path.unlink() + submodel_path.with_suffix(".bin").unlink() + compressed_submodel_path.rename(submodel_path) + compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin")) + # Unpatch modules after GPTQ export if do_gptq_patching: torch.cuda.is_available = orig_cuda_check diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 0b937734c..dc2af6878 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -49,7 +49,6 @@ from .model_patcher import patch_model_with_bettertransformer from .stateful import ensure_export_task_support_stateful, ensure_stateful_is_available, patch_stateful from .utils import ( - _MAX_UNCOMPRESSED_SIZE, OV_XML_FILE_NAME, clear_class_registry, flattenize_inputs, @@ -76,21 +75,7 @@ def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None): - compress_to_fp16 = False - - if ov_config is not None: - if ov_config.quantization_config: - if not is_nncf_available(): - raise ImportError( - "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`" - ) - - from optimum.intel.openvino.quantization import _weight_only_quantization - - _weight_only_quantization(model, ov_config.quantization_config) - - compress_to_fp16 = ov_config.dtype == "fp16" - + compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16" model = _add_version_info_to_model(model, library_name) save_model(model, path, compress_to_fp16) @@ -643,25 +628,6 @@ def export_from_model( ) logging.disable(logging.NOTSET) - if ov_config is None: - if library_name == "diffusers": - num_parameters = model.unet.num_parameters() - else: - num_parameters = sum(param.numel() for param in list(model.parameters()) if param.requires_grad) - - if num_parameters >= _MAX_UNCOMPRESSED_SIZE: - if is_nncf_available(): - from ...intel.openvino.configuration import OVConfig - - ov_config = OVConfig(quantization_config={"bits": 8, "sym": False}) - - logger.info("The model weights will be quantized to int8_asym.") - else: - logger.warning( - "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf." - "please install it with `pip install nncf`" - ) - if library_name != "diffusers": # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) @@ -720,6 +686,8 @@ def export_from_model( patch_16bit_model=patch_16bit_model, ) + return files_subpaths + def export_tokenizer( tokenizer, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 23ff3a03c..5835bc76a 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import inspect # ruff: noqa @@ -22,6 +23,7 @@ from enum import Enum from functools import partial from typing import Union + import pytest import evaluate import numpy as np @@ -538,76 +540,80 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): self.assertEqual(0, num_int8) def test_ovmodel_load_large_model_with_default_compressed_weights(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - model_parameters.return_value = [mock_tensor] - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], export=True, compile=False, use_cache=False - ) - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(quantization_config={"bits": 8}), - library_name="transformers", - ) + def main_export_in_stacktrace(*args, **kwargs): + # Compression was called from `main_export` + self.assertTrue(inspect.stack()[5].function == "main_export") + + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch( + "nncf.compress_weights", side_effect=main_export_in_stacktrace + ) as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], export=True, compile=False, use_cache=False + ) + compression_params = { + "mode": nncf.CompressWeightsMode.INT8_ASYM, + "ratio": 1.0, + "group_size": -1, + "all_layers": None, + "sensitivity_metric": None, + "dataset": None, + "ignored_scope": nncf.IgnoredScope(), + "awq": None, + "subset_size": 128, + "scale_estimation": None, + } + compress_weights_patch.assert_called_with( + unittest.mock.ANY, + **compression_params, + ) def test_ovmodel_load_large_model_with_uncompressed_weights(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - model_parameters.return_value = [mock_tensor] - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False - ) - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(dtype="auto"), - library_name="transformers", - ) + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False + ) + compress_weights_patch.assert_not_called() def test_ovmodel_load_large_model_with_additional_quantization_config(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = 
unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], - export=True, - compile=False, - use_cache=False, - quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), - ) - # quantization will be performed later, using load_model - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(dtype="auto"), - library_name="transformers", - ) - compression_params = { - "mode": nncf.CompressWeightsMode.INT4_SYM, - "ratio": 0.8, - "group_size": -1, - "all_layers": None, - "sensitivity_metric": None, - "dataset": None, - "ignored_scope": nncf.IgnoredScope(), - "awq": None, - "subset_size": 128, - "scale_estimation": None, - } - compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) + def main_export_not_in_stacktrace(*args, **kwargs): + # Compression was not called from `main_export` + self.assertTrue(all(frame_info.function != "main_export" for frame_info in inspect.stack())) + + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch( + "nncf.compress_weights", side_effect=main_export_not_in_stacktrace + ) as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], + export=True, + compile=False, + use_cache=False, + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + ) + compression_params = { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "ratio": 0.8, + "group_size": -1, + "all_layers": None, + "sensitivity_metric": None, + "dataset": None, + "ignored_scope": nncf.IgnoredScope(), + "awq": None, + "subset_size": 128, + "scale_estimation": None, + } + compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantization_config, expected_ov_int4):