From 50f4c3449c2c3d2c69a406be09f26ee1b100a5e0 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 16:15:20 +0800 Subject: [PATCH 01/21] update new model list with new reuploaded model and ipex option in modelui --- src/embeddedllm/entrypoints/modelui.py | 239 +++++++++++++++++-------- 1 file changed, 169 insertions(+), 70 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index ca1da44..e6fd1b0 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -64,45 +64,103 @@ class ModelCard(BaseModel): context_length: int size: Optional[int] = 0 +ipex_model_dict_list = { + "microsoft/Phi-3-mini-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", + repo_id="microsoft/Phi-3-mini-4k-instruct", + model_name="Phi-3-mini-4k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-mini-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-mini-128k-instruct", + model_name="Phi-3-mini-128k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-medium-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-4k-instruct", + model_name="Phi-3-medium-4k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-medium-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-128k-instruct", + model_name="Phi-3-medium-128k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), +} dml_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/directml/directml-int4-awq-block-128", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="directml/directml-int4-awq-block-128", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml", + model_name="Phi-3-mini-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml", + model_name="Phi-3-mini-128k-instruct-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4", - repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx", - model_name="Phi-3-mini-4k-instruct-062024-onnx", - subfolder="onnx/directml/Phi-3-mini-4k-instruct-062024-int4", + "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml", + model_name="Phi-3-medium-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + 
context_length=4096, + ), + "EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml", + model_name="Phi-3-medium-128k-instruct-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + model_name="Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-onnx", - subfolder="onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/gemma-2b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx/tree/main/onnx/directml/gemma-2b-it-int4", - repo_id="EmbeddedLLM/gemma-2b-it-onnx", - model_name="gemma-2b-it-int4", - subfolder="onnx/directml/gemma-2b-it-int4", + "EmbeddedLLM/gemma-2b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-2b-it-int4-onnx-directml", + model_name="gemma-2b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/gemma-7b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-onnx/tree/main/onnx/directml/gemma-7b-it-int4", - repo_id="EmbeddedLLM/gemma-7b-it-onnx", - model_name="gemma-7b-it-int4", - subfolder="onnx/directml/gemma-7b-it-int4", + "EmbeddedLLM/gemma-7b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-7b-it-int4-onnx-directml", + model_name="gemma-7b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), @@ -114,70 +172,94 @@ class ModelCard(BaseModel): repo_type="model", context_length=4096, ), - "EmbeddedLLM/Starling-LM-7b-beta-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-onnx/tree/main/onnx/directml/Starling-LM-7b-beta-int4", - repo_id="EmbeddedLLM/Starling-LM-7b-beta-onnx", - model_name="Starling-LM-7b-beta-int4", - subfolder="onnx/directml/Starling-LM-7b-beta-int4", + "EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml", + model_name="Starling-LM-7b-beta-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-int4": ModelCard( - 
hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/directml/openchat-3.6-8b-20240522-int4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-int4", - subfolder="onnx/directml/openchat-3.6-8b-20240522-int4", + "EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml", + model_name="openchat-3.6-8b-20240522-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx/tree/main/onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", - repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx", - model_name="01-ai_Yi-1.5-6B-Chat-int4", - subfolder="onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", + "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + model_name="01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), } cpu_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=8192, ), @@ -231,8 +313,13 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in ipex_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, 
repo_type=v.repo_type + ) + -def convert_to_dataframe(dml_model_dict_list): +def convert_to_dataframe(model_dict_list): # Create lists to store the data model_names = [] hf_urls = [] @@ -244,7 +331,7 @@ def convert_to_dataframe(dml_model_dict_list): context_lengths = [] # Iterate through the dictionary and extract the data - for key, model_card in dml_model_dict_list.items(): + for key, model_card in model_dict_list.items(): model_names.append(key) hf_urls.append(model_card.hf_url) repo_ids.append(model_card.repo_id) @@ -318,6 +405,9 @@ def update_model_list(engine_type): if engine_type == "DirectML": models = sorted(list(dml_model_dict_list.keys())) models_pandas = convert_to_dataframe(dml_model_dict_list) + elif engine_type == "Ipex": + models = sorted(list(ipex_model_dict_list.keys())) + models_pandas = convert_to_dataframe(ipex_model_dict_list) else: models = sorted(list(cpu_model_dict_list.keys())) models_pandas = convert_to_dataframe(cpu_model_dict_list) @@ -340,12 +430,14 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None, repo_type="model", ) @@ -402,6 +494,8 @@ def download_model(engine_type, model_name): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -412,7 +506,7 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." 
else None, repo_type="model", ) yield snapshot_path @@ -443,9 +537,14 @@ def main(): with gr.Accordion("See More Model Details", open=False): model_info_pandas_frame = gr.Dataframe(value=None) + default_value = "CPU" # Default value + if backend == "directml": + default_value = "DirectML" + elif backend == "ipex": + default_value = "Ipex" selected_engine_type = gr.Dropdown( - choices=["DirectML", "CPU"], - value="DirectML" if backend == "directml" else "CPU", + choices=["DirectML", "Ipex", "CPU"], + value = default_value, multiselect=False, label="LLM Engine", show_label=True, From ec2f421cab254685bb0c90ba89424bc475a078fd Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 16:49:01 +0800 Subject: [PATCH 02/21] fix the typo of mistral repo id --- src/embeddedllm/entrypoints/modelui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index e6fd1b0..212c67b 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -142,7 +142,7 @@ class ModelCard(BaseModel): ), "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", + repo_id="EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", subfolder=".", repo_type="model", From dbdefa04032d8cb1e473130bb00ece8757a68c80 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 17:45:54 +0800 Subject: [PATCH 03/21] edit to the latest version of models available --- README.md | 5 +++-- docs/model/onnxruntime_cpu_models.md | 14 ++++++++++++++ docs/model/onnxruntime_directml_models.md | 19 +++++++++++++++++++ docs/model/onnxruntime_models.md | 19 ------------------- 4 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 docs/model/onnxruntime_cpu_models.md create mode 100644 docs/model/onnxruntime_directml_models.md delete mode 100644 docs/model/onnxruntime_models.md diff --git a/README.md b/README.md index 99542ed..2fcfed5 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E ## Table Content - [Supported Models](#supported-models-quick-start) - - [Onnxruntime Models](./docs/model/onnxruntime_models.md) + - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md) + - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md) - [Ipex-LLM Models](./docs/model/ipex_models.md) - [Getting Started](#getting-started) - [Installation From Source](#installation) @@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). 
E | Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) | | Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | | Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) | -| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | +| Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) | | Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | | Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) | | Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) | diff --git a/docs/model/onnxruntime_cpu_models.md b/docs/model/onnxruntime_cpu_models.md new file mode 100644 index 0000000..6951ac8 --- /dev/null +++ b/docs/model/onnxruntime_cpu_models.md @@ -0,0 +1,14 @@ +# Model Powered by Onnxruntime CPU GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | +|-------------------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32 | 8B | 8192 | 6.339 | 
[EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | diff --git a/docs/model/onnxruntime_directml_models.md b/docs/model/onnxruntime_directml_models.md new file mode 100644 index 0000000..0f6a3a3 --- /dev/null +++ b/docs/model/onnxruntime_directml_models.md @@ -0,0 +1,19 @@ +# Model Powered by Onnxruntime DirectML GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | +|--------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-directml | 3.8B | 4096 | 1.989 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml) | +| Phi-3-mini-128k-instruct-onnx-directml | 3.8B | 131072 | 2.018 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml) | +| Phi-3-medium-4k-instruct-onnx-directml | 17B | 4096 | 6.987 | [EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml) | +| Phi-3-medium-128k-instruct-onnx-directml | 17B | 131072 | 7.025 | [EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml) | +| Phi-3-mini-4k-instruct-062024-int4-onnx-directml | 3.8B | 4096 | 2.137 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml) | +| mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml | 7B | 32768 | 3.988 | [EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml) | +| gemma-2b-it-int4-onnx-directml | 2B | 8192 | 2.314 | [EmbeddedLLM/gemma-2b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml) | +| gemma-7b-it-int4-onnx-directml | 7B | 8192 | 5.958 | [EmbeddedLLM/gemma-7b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml) | +| llama-2-7b-chat-int4-onnx-directml | 7B | 4096 | 3.708 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | +| Starling-LM-7b-beta-int4-onnx-directml | 7B | 8192 | 3.974 | [EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml) | +| openchat-3.6-8b-20240522-int4-onnx-directml | 8B | 8192 | 4.922 | [EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml) | +| Yi-1.5-6B-Chat-int4-onnx-directml | 6B | 32768 | 3.532 | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml) | + diff --git a/docs/model/onnxruntime_models.md b/docs/model/onnxruntime_models.md deleted file mode 100644 index 4d61ffe..0000000 --- a/docs/model/onnxruntime_models.md +++ 
/dev/null @@ -1,19 +0,0 @@ -# Model Powered by Onnxruntime GenAI - -## Supported Models - -| Models | Parameters | Context Length | Link | -| --- | --- | --- | --- | -| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) | -| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | -| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) | -| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) | -| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) | -| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) | -| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) | -| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) | -| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) | -| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) | -| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) | From 0965d51145a0b62cac1fe7d36300b2ac8ff9a038 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 17:46:16 +0800 Subject: [PATCH 04/21] change the context length of 128k to 131072 --- src/embeddedllm/entrypoints/modelui.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 212c67b..1bbf9de 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -79,7 +79,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "microsoft/Phi-3-medium-4k-instruct": ModelCard( hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", @@ -95,7 +95,7 @@ class ModelCard(BaseModel): model_name="Phi-3-medium-128k-instruct", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), } @@ -114,7 +114,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct-onnx-directml", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", @@ -130,7 +130,7 @@ class ModelCard(BaseModel): model_name="Phi-3-medium-128k-instruct-onnx-directml", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", @@ -221,7 +221,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", @@ -229,7 +229,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", From 5dbf495e8f996b8322479e86f2fbe13ee136cd6a Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 13 Aug 2024 15:53:25 +0800 Subject: [PATCH 05/21] onnx auto download model if repo id is provided as model path --- src/embeddedllm/backend/onnxruntime_engine.py | 11 ++++++++ src/embeddedllm/entrypoints/modelui.py | 25 ++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/embeddedllm/backend/onnxruntime_engine.py b/src/embeddedllm/backend/onnxruntime_engine.py index 82b5dca..95d13c3 100644 --- a/src/embeddedllm/backend/onnxruntime_engine.py +++ b/src/embeddedllm/backend/onnxruntime_engine.py @@ -1,9 +1,11 @@ # from embeddedllm.transformers_utils.image_processing_phi3v import Phi3VImageProcessor import contextlib import time +import os from pathlib import Path from tempfile import TemporaryDirectory from typing import AsyncIterator, List, Optional +from huggingface_hub import snapshot_download import onnxruntime_genai as og from loguru import logger @@ -39,6 +41,15 @@ def onnx_generator_context(model, params): class OnnxruntimeEngine(BaseLLMEngine): def __init__(self, model_path: str, vision: bool, device: str = "cpu"): self.model_path = model_path + + if not os.path.exists(model_path): + snapshot_path = snapshot_download( + repo_id=model_path, + allow_patterns=None, + repo_type="model", + ) + model_path = snapshot_path + self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True) self.device = device diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 1bbf9de..66fdd20 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -441,32 +441,49 @@ def deploy_model(engine_type, model_name, port_number): repo_type="model", ) - model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + if llm_model_card.subfolder != ".": + model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + else: + model_path = snapshot_path + + print("Model path:",model_path) + if engine_type == 'Ipex': + device = 'xpu' + + else: + device = 'cpu' + deployed_model.process = subprocess.Popen( [ "ellm_server", "--model_path", model_path, + "--backend", + backend, + "--device", + device, "--port", 
f"{port_number}", - "--served_model_name", - model_name, + # "--served_model_name", + # model_name ] ) + deployed_model.model_name = model_name while True: # ping the server to see if it is up. if check_health(f"http://localhost:{port_number}/health"): break - + deployment_message = f"""

Deployment Status:

Model: {model_name}

Engine: {engine_type}

Port: {port_number}

+
+    Model Path: {model_path}

""" From fb2c63ebad4b4a71c8a7eedcda2a3f0cb45e5e31 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 13 Aug 2024 17:47:03 +0800 Subject: [PATCH 06/21] formated with black --- src/embeddedllm/entrypoints/modelui.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 66fdd20..3a62e04 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -64,6 +64,7 @@ class ModelCard(BaseModel): context_length: int size: Optional[int] = 0 + ipex_model_dict_list = { "microsoft/Phi-3-mini-4k-instruct": ModelCard( hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", @@ -437,7 +438,9 @@ def deploy_model(engine_type, model_name, port_number): snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None, + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) @@ -446,14 +449,14 @@ def deploy_model(engine_type, model_name, port_number): else: model_path = snapshot_path - print("Model path:",model_path) + print("Model path:", model_path) + + if engine_type == "Ipex": + device = "xpu" - if engine_type == 'Ipex': - device = 'xpu' - else: - device = 'cpu' - + device = "cpu" + deployed_model.process = subprocess.Popen( [ "ellm_server", @@ -476,7 +479,7 @@ def deploy_model(engine_type, model_name, port_number): # ping the server to see if it is up. if check_health(f"http://localhost:{port_number}/health"): break - + deployment_message = f"""

Deployment Status:

@@ -523,7 +526,9 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None, + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) yield snapshot_path @@ -561,7 +566,7 @@ def main(): default_value = "Ipex" selected_engine_type = gr.Dropdown( choices=["DirectML", "Ipex", "CPU"], - value = default_value, + value=default_value, multiselect=False, label="LLM Engine", show_label=True, From f8c8f27d4eb410ddfbd1886e3d916ab07adc27e3 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Wed, 14 Aug 2024 11:13:19 +0800 Subject: [PATCH 07/21] fixed with flake8 --- src/embeddedllm/inputs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/embeddedllm/inputs.py b/src/embeddedllm/inputs.py index 9797d05..8f05498 100644 --- a/src/embeddedllm/inputs.py +++ b/src/embeddedllm/inputs.py @@ -23,13 +23,13 @@ class ImagePixelData(TypedDict): # https://github.com/vllm-project/vllm/pull/4028 @overload -def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: - ... +def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: ... @overload -def parse_and_batch_prompt(prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: - ... +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]] +) -> Sequence[ParsedTokens]: ... def parse_and_batch_prompt( From d54b4d8d673717388c2d97e3d62dd360220371c2 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Wed, 14 Aug 2024 11:13:45 +0800 Subject: [PATCH 08/21] add openvino description and the device gpu --- src/embeddedllm/engine.py | 2 +- src/embeddedllm/entrypoints/api_server.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/embeddedllm/engine.py b/src/embeddedllm/engine.py index 3eac11c..86f589c 100644 --- a/src/embeddedllm/engine.py +++ b/src/embeddedllm/engine.py @@ -80,7 +80,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend: else: raise ValueError( - f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda` and `directml`." + f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`." 
) self.tokenizer = self.engine.tokenizer diff --git a/src/embeddedllm/entrypoints/api_server.py b/src/embeddedllm/entrypoints/api_server.py index 9385f24..efc2916 100644 --- a/src/embeddedllm/entrypoints/api_server.py +++ b/src/embeddedllm/entrypoints/api_server.py @@ -28,9 +28,9 @@ class Config(BaseSettings): ) port: int = Field(default=6979, description="Server port.") host: str = Field(default="0.0.0.0", description="Server host.") - device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`") + device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`") backend: str = Field( - default="directml", description="Backend engine: `cpu`, `ipex` and `directml`" + default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`" ) response_role: str = Field(default="assistant", description="Server response role.") uvicorn_log_level: str = Field( From 1c3b393ba10aac9af7745b0e45ecd3498e86cb61 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Wed, 14 Aug 2024 14:48:53 +0800 Subject: [PATCH 09/21] update openvino in modelui list --- src/embeddedllm/entrypoints/modelui.py | 88 +++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 3a62e04..cc1e15c 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -20,7 +20,7 @@ def get_embeddedllm_backend(): version = importlib.metadata.version("embeddedllm") # Use regex to extract the backend - match = re.search(r"\+(directml|cpu|cuda|ipex)$", version) + match = re.search(r"\+(directml|cpu|cuda|ipex|openvino)$", version) if match: backend = match.group(1) @@ -65,6 +65,73 @@ class ModelCard(BaseModel): size: Optional[int] = 0 +openvino_model_dict_list = { + # "OpenVINO/Phi-3-mini-128k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-128k-instruct-int4-ov", + # model_name="Phi-3-mini-128k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=131072, + # ), + "OpenVINO/Phi-3-mini-128k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-128k-instruct-int8-ov", + model_name="Phi-3-mini-128k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=131072, + ), + # "OpenVINO/Phi-3-mini-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-4k-instruct-int4-ov", + # model_name="Phi-3-mini-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-mini-4k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-4k-instruct-int8-ov", + model_name="Phi-3-mini-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + # "OpenVINO/Phi-3-medium-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-medium-4k-instruct-int4-ov", + # model_name="Phi-3-medium-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-medium-4k-instruct-int8-ov": ModelCard( + 
hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-medium-4k-instruct-int8-ov", + model_name="Phi-3-medium-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "OpenVINO/open_llama_7b_v2-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/open_llama_7b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_7b_v2-int8-ov", + model_name="open_llama_7b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), + "OpenVINO/open_llama_3b_v2-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/open_llama_3b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_3b_v2-int8-ov", + model_name="open_llama_3b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), +} + ipex_model_dict_list = { "microsoft/Phi-3-mini-4k-instruct": ModelCard( hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", @@ -319,6 +386,11 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in openvino_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) + def convert_to_dataframe(model_dict_list): # Create lists to store the data @@ -409,6 +481,9 @@ def update_model_list(engine_type): elif engine_type == "Ipex": models = sorted(list(ipex_model_dict_list.keys())) models_pandas = convert_to_dataframe(ipex_model_dict_list) + elif engine_type == 'OpenVino': + models = sorted(list(openvino_model_dict_list.keys())) + models_pandas = convert_to_dataframe(openvino_model_dict_list) else: models = sorted(list(cpu_model_dict_list.keys())) models_pandas = convert_to_dataframe(cpu_model_dict_list) @@ -433,6 +508,8 @@ def deploy_model(engine_type, model_name, port_number): llm_model_card = dml_model_dict_list[model_name] elif engine_type == "Ipex": llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -453,7 +530,8 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "Ipex": device = "xpu" - + elif engine_type == "OpenVino": + device = "gpu" else: device = "cpu" @@ -516,6 +594,8 @@ def download_model(engine_type, model_name): llm_model_card = dml_model_dict_list[model_name] elif engine_type == "Ipex": llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -564,8 +644,10 @@ def main(): default_value = "DirectML" elif backend == "ipex": default_value = "Ipex" + elif backend == "openvino": + default_value = "OpenVino" selected_engine_type = gr.Dropdown( - choices=["DirectML", "Ipex", "CPU"], + choices=["DirectML", "Ipex", "OpenVino", "CPU"], value=default_value, multiselect=False, label="LLM Engine", From 75eff7cf6c1155c5362f0c7a29d7eff04ce43709 Mon Sep 17 00:00:00 2001 From: szeyu Date: Thu, 15 Aug 2024 14:43:44 +0800 Subject: [PATCH 10/21] first commit of benchmark code --- benchmark/analyse_detailed_benchmark.py | 124 +++++++++++++++++++ benchmark/ellm_benchmark.py | 117 +++++++++++++++++ benchmark/loop_analyse_detailed_benchmark.py | 20 +++ benchmark/loop_ellm_benchmark.py | 57 +++++++++ benchmark/sampleText.txt | 91 ++++++++++++++ 5 files changed, 409 insertions(+) create mode 
100644 benchmark/analyse_detailed_benchmark.py create mode 100644 benchmark/ellm_benchmark.py create mode 100644 benchmark/loop_analyse_detailed_benchmark.py create mode 100644 benchmark/loop_ellm_benchmark.py create mode 100644 benchmark/sampleText.txt diff --git a/benchmark/analyse_detailed_benchmark.py b/benchmark/analyse_detailed_benchmark.py new file mode 100644 index 0000000..ca45d30 --- /dev/null +++ b/benchmark/analyse_detailed_benchmark.py @@ -0,0 +1,124 @@ +import os +import re +import numpy as np +import pandas as pd +import argparse + +def extract_data_from_log(log_file): + average_tps_list = [] + prompt_tokens_per_second_list = [] + new_tokens_per_second_list = [] + error_count = 0 + error_state = False + + if not os.path.exists(log_file): + print(f"Log file does not exist: {log_file}") + return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count + + with open(log_file, 'r') as file: + for line in file: + if "ERROR" in line: + error_count += 1 + error_state = True + continue + + if "Average tps" in line and error_state == True: + error_state = False + continue + + if "Average tps" in line: + average_tps = float(re.search(r"Average tps: ([\d.]+)", line).group(1)) + average_tps_list.append(average_tps) + continue + + if "Prompt tokens per second" in line: + prompt_tokens_per_second = float(re.search(r"Prompt tokens per second: ([\d.]+)", line).group(1)) + prompt_tokens_per_second_list.append(prompt_tokens_per_second) + if "New tokens per second" in line: + new_tokens_per_second = float(re.search(r"New tokens per second: ([\d.]+)", line).group(1)) + new_tokens_per_second_list.append(new_tokens_per_second) + + return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count + +def calculate_statistics(data): + data_np = np.array(data) + stats = { + "std": np.std(data_np, ddof=1), # Sample standard deviation + "mean": np.mean(data_np), + "min": np.min(data_np), + "1%": np.percentile(data_np, 1), + "25%": np.percentile(data_np, 25), + "50%": np.percentile(data_np, 50), # Median + "75%": np.percentile(data_np, 75), + "99%": np.percentile(data_np, 99), + "max": np.max(data_np) + } + return stats + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Process log files and generate statistics.") + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + return parser.parse_args() + +def main(model_name): + token_ins = [128, 256, 512, 1024] + token_outs = [128, 256, 512, 1024] + + statistics = [] + + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + for input_token_length in token_ins: + for output_token_length in token_outs: + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') + average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count = extract_data_from_log(log_file) + + if not average_tps_list and not prompt_tokens_per_second_list and not new_tokens_per_second_list: + # Log file does not exist or is empty, append "-" for each statistical value + statistics.append([ + model_name, input_token_length, output_token_length, + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + error_count + ]) + else: + min_len = min(len(average_tps_list), len(prompt_tokens_per_second_list), 
len(new_tokens_per_second_list)) + + if min_len > 0: + prompt_stats = calculate_statistics(prompt_tokens_per_second_list[5:min_len]) + new_token_stats = calculate_statistics(new_tokens_per_second_list[5:min_len]) + average_tps_stats = calculate_statistics(average_tps_list[5:min_len]) + + statistics.append([ + model_name, input_token_length, output_token_length, + prompt_stats["std"], prompt_stats["mean"], prompt_stats["min"], prompt_stats["1%"], prompt_stats["25%"], prompt_stats["50%"], prompt_stats["75%"], prompt_stats["99%"], prompt_stats["max"], + new_token_stats["std"], new_token_stats["mean"], new_token_stats["min"], new_token_stats["1%"], new_token_stats["25%"], new_token_stats["50%"], new_token_stats["75%"], new_token_stats["99%"], new_token_stats["max"], + average_tps_stats["std"], average_tps_stats["mean"], average_tps_stats["min"], average_tps_stats["1%"], average_tps_stats["25%"], average_tps_stats["50%"], average_tps_stats["75%"], average_tps_stats["99%"], average_tps_stats["max"], + error_count + ]) + + # Create a DataFrame + columns = [ + "Model", "Token In", "Token Out", + "Token In / sec std", "Token In / sec mean", "Token In / sec min", "Token In / sec 1%", "Token In / sec 25%", "Token In / sec 50%", "Token In / sec 75%", "Token In / sec 99%", "Token In / sec max", + "Token Out / sec std", "Token Out / sec mean", "Token Out / sec min", "Token Out / sec 1%", "Token Out / sec 25%", "Token Out / sec 50%", "Token Out / sec 75%", "Token Out / sec 99%", "Token Out / sec max", + "Average Token / sec std", "Average Token / sec mean", "Average Token / sec min", "Average Token / sec 1%", "Average Token / sec 25%", "Average Token / sec 50%", "Average Token / sec 75%", "Average Token / sec 99%", "Average Token / sec max", + "No of Fail" + ] + df = pd.DataFrame(statistics, columns=columns) + + # Create the statistics directory if it doesn't exist + output_dir = "statistics" + os.makedirs(output_dir, exist_ok=True) + + # Write to Excel + output_file = os.path.join(output_dir, f"{model_name}_statistics.xlsx") + df.to_excel(output_file, index=False) + print(f"Statistics written to {output_file}") + +if __name__ == "__main__": + args = parse_arguments() + main(args.model_name) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py new file mode 100644 index 0000000..d41023d --- /dev/null +++ b/benchmark/ellm_benchmark.py @@ -0,0 +1,117 @@ +import sys +import os +import time +import asyncio +import argparse +from loguru import logger + +# Add the 'src' directory to sys.path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src'))) + +# Import the engine module +from embeddedllm import engine +from embeddedllm import sampling_params + +async def benchmark(input_token_length, output_token_length, model_path, model_name, backend): + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') + + # Add the log file to the logger (it will append if the file already exists) + logger.add(log_file, mode='a') + + # need different parameter for cpu and directml + if backend == "cpu": + device="cpu" + elif backend == "ipex": + device="xpu" + elif backend == "openvino": + device="gpu" + elif backend == "directml": + device = "" + + model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) + + logger.info(f"Model: 
{model_name}") + + model.tokenizer.chat_template = "{% for message in messages %}{{ message['content']}}{% endfor %}" # Override + + prompt_text = """ + + """ + # Define the path to the file + file_path = "sampleText.txt" + + # Open the file and read its contents into the variable + with open(file_path, 'r') as file: + prompt_text = file.read() + + input_tokens = model.tokenizer.encode(prompt_text)[:input_token_length-1] + input_text = model.tokenizer.decode(input_tokens) + print(input_text) + input_tokens = model.tokenizer.encode(input_text) + print(len(input_tokens)) + + assert input_token_length-1 == len(input_tokens) + + PromptInputs = { + "prompt": input_text + } + + sampling_params_config = sampling_params.SamplingParams( + max_tokens=output_token_length, + top_p=0.1, + top_k=1, + temperature=1, + repetition_penalty=0.01, + ) + + start = time.perf_counter() + + async def generate(): + results = [] + async for response in model.generate( + inputs=PromptInputs, + sampling_params=sampling_params_config, + request_id="benchmark", + stream=True, + ): + results.append(response) + return results + + response = await generate() + end = time.perf_counter() + + logger.info(response[0]) # Access the generated text from the response + + total_time_taken = end - start + logger.info(f"Total time taken: {total_time_taken:.2f} seconds") + + average_tps = (input_token_length + output_token_length) / total_time_taken + logger.info("Average tps: "+ str(average_tps)) + + # Remove the logger to close the log file + logger.remove() + +def main(): + parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") + parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, ipex, openvino or directml)') + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + parser.add_argument('--model_path', type=str, required=True, help='Path to the model') + parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') + parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') + + args = parser.parse_args() + + # Cap the input tokens to 2048 + if args.token_in > 2048: + print("Input tokens capped to 2048.") + args.token_in = 2048 + + # Run the async function using asyncio.run() + asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmark/loop_analyse_detailed_benchmark.py b/benchmark/loop_analyse_detailed_benchmark.py new file mode 100644 index 0000000..e01bdda --- /dev/null +++ b/benchmark/loop_analyse_detailed_benchmark.py @@ -0,0 +1,20 @@ +import subprocess + +model_names = [ + # model names + +] + + +# Path to the ellm_benchmark.py script +analyse_detailed_benchmark_script = "analyse_detailed_benchmark.py" + +for model_name in model_names: + # Construct the command + command = [ + "python", analyse_detailed_benchmark_script, + "--model_name", model_name, + ] + + # Execute the command + subprocess.run(command) \ No newline at end of file diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py new file mode 100644 index 0000000..16cb47e --- /dev/null +++ b/benchmark/loop_ellm_benchmark.py @@ -0,0 +1,57 @@ +import subprocess + +# Define the models and token lengths +model_names = [ + # model names +] + +model_paths = [ + # path to model in order to model names / model 
repo id +] + +token_in_out = [ + (1024, 1024), + (1024, 512), + (1024, 256), + (1024, 128), + (512, 1024), + (512, 512), + (512, 256), + (512, 128), + (256, 1024), + (256, 512), + (256, 256), + (256, 128), + (128, 1024), + (128, 512), + (128, 256), + (128, 128), +] + +# Choose backend +# backend = "cpu" +# backend = "directml" +# backend = "ipex" +# backend = "openvino" + +# Number of loops +loop_count = 20 + +# Path to the ellm_benchmark.py script +ellm_benchmark_script = "ellm_benchmark.py" + +for model_name, model_path in zip(model_names, model_paths): + for input_token_length, output_token_length in token_in_out: + for i in range(loop_count): + # Construct the command + command = [ + "python", ellm_benchmark_script, + "--backend", backend, + "--model_name", model_name, + "--model_path", model_path, + "--token_in", str(input_token_length), + "--token_out", str(output_token_length) + ] + + # Execute the command + subprocess.run(command) diff --git a/benchmark/sampleText.txt b/benchmark/sampleText.txt new file mode 100644 index 0000000..3da3fbb --- /dev/null +++ b/benchmark/sampleText.txt @@ -0,0 +1,91 @@ +A large language model (LLM) is a computational model notable for its ability to achieve general-purpose language +generation and other natural language processing tasks such as classification. Based on language models, LLMs acquire +these abilities by learning statistical relationships from vast amounts of text during a computationally intensive +self-supervised and semi-supervised training process.[1] LLMs can be used for text generation, a form of generative AI, +by taking an input text and repeatedly predicting the next token or word.[2] + +LLMs are artificial neural networks that utilize the transformer architecture, invented in 2017. The largest and +most capable LLMs, as of June 2024, are built with a decoder-only transformer-based architecture, which enables +efficient processing and generation of large-scale text data. + +Historically, up to 2020, fine-tuning was the primary method used to adapt a model for specific tasks. However, +larger models such as GPT-3 have demonstrated the ability to achieve similar results through prompt engineering, +which involves crafting specific input prompts to guide the model's responses.[3] These models acquire knowledge +about syntax, semantics, and ontologies[4] inherent in human language corpora, but they also inherit inaccuracies +and biases present in the data they are trained on.[5] + +Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), +Google's Gemini (the latter of which is currently used in the chatbot of the same name), Meta's LLaMA family of models, +Anthropic's Claude models, and Mistral AI's models. + +History +Before 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, +the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 +billion words achieved then-SOTA perplexity.[6] In the 2000s, as Internet use became prevalent, some researchers +constructed Internet-scale language datasets ("web as corpus"[7]), upon which they trained statistical language +models.[8][9] In 2009, in most language processing tasks, statistical language models dominated over symbolic +language models, as they can usefully ingest large datasets.[10] + +After neural networks became dominant in image processing around 2012, they were applied to language modelling as +well. 
Google converted its translation service to Neural Machine Translation in 2016. As it was before Transformers, +it was done by seq2seq deep LSTM networks. + + +An illustration of main components of the transformer model from the original paper, where layers were normalized +after (instead of before) multiheaded attention At the 2017 NeurIPS conference, Google researchers introduced the +transformer architecture in their landmark paper "Attention Is All You Need". This paper's goal was to improve upon +2014 Seq2seq technology,[11] and was based mainly on the attention mechanism developed by Bahdanau et al. in 2014. +[12] The following year in 2018, BERT was introduced and quickly became "ubiquitous".[13] Though the original +transformer has both encoder and decoder blocks, BERT is an encoder-only model. + +Although decoder-only GPT-1 was introduced in 2018, it was GPT-2 in 2019 that caught widespread attention because +OpenAI at first deemed it too powerful to release publicly, out of fear of malicious use.[14] GPT-3 in 2020 went +a step further and as of 2024 is available only via API with no offering of downloading the model to execute locally. +But it was the 2022 consumer-facing browser-based ChatGPT that captured the imaginations of the general population +and caused some media hype and online buzz.[15] The 2023 GPT-4 was praised for its increased accuracy and as a +"holy grail" for its multimodal capabilities.[16] OpenAI did not reveal high-level architecture and the number +of parameters of GPT-4. + +Competing language models have for the most part been attempting to equal the GPT series, at least in terms of +number of parameters.[17] + +Since 2022, source-available models have been gaining popularity, especially at first with BLOOM and LLaMA, though +both have restrictions on the field of use. Mistral AI's models Mistral 7B and Mixtral 8x7b have the more permissive +Apache License. As of June 2024, The Instruction fine tuned variant of the Llama 3 70 billion parameter model is +the most powerful open LLM according to the LMSYS Chatbot Arena Leaderboard, being more powerful than GPT-3.5 but +not as powerful as GPT-4.[18] + +As of 2024, the largest and most capable models are all based on the Transformer architecture. Some recent +implementations are based on other architectures, such as recurrent neural network variants and Mamba +(a state space model).[19][20][21] + +Dataset preprocessing +See also: List of datasets for machine-learning research ยง Internet +Probabilistic tokenization +Because machine learning algorithms process numbers rather than text, the text must be converted to numbers. +In the first step, a vocabulary is decided upon, then integer indexes are arbitrarily but uniquely assigned +to each vocabulary entry, and finally, an embedding is associated to the integer index. Algorithms include +byte-pair encoding and WordPiece. + +Probabilistic tokenization also compresses the datasets. Because LLMs generally require input to be an array +that is not jagged, the shorter texts must be "padded" until they match the length of the longest one. How many +tokens are, on average, needed per word depends on the language of the dataset.[22][23] + +BPE +Using a modification of byte-pair encoding, in the first step, all unique characters (including blanks and +punctuation marks) are treated as an initial set of n-grams (i.e. initial set of uni-grams). 
Successively +the most frequent pair of adjacent characters is merged into a bi-gram and all instances of the pair are +replaced by it. All occurrences of adjacent pairs of (previously merged) n-grams that most frequently occur +together are then again merged into even lengthier n-gram repeatedly until a vocabulary of prescribed size +is obtained (in case of GPT-3, the size is 50257).[24] Token vocabulary consists of integers, spanning from +zero up to the size of the token vocabulary. New words can always be interpreted as combinations of the +tokens and the initial-set uni-grams.[25] + +A token vocabulary based on the frequencies extracted from mainly English corpora uses as few tokens as +possible for an average English word. An average word in another language encoded by such an English-optimized +tokenizer is however split into suboptimal amount of tokens. GPT-2 tokenizer can use up to 15 times more tokens +per word for some languages, for example for the Shan language from Myanmar. Even more widespread languages +such as Portuguese and German have "a premium of 50%" compared to English.[26] + +For example, here is how tokenizer used by GPT-3 (Legacy) split the following sentence tokenizer: texts -> +series of numerical "tokens". \ No newline at end of file From 608670ce2af5cf531a0d6c9b92c75095ebbd95ed Mon Sep 17 00:00:00 2001 From: szeyu Date: Thu, 15 Aug 2024 15:11:14 +0800 Subject: [PATCH 11/21] update for the markdown to teach about benchmark code usage --- benchmark/benchmark.md | 81 ++++++++++++++++++++++++++++++++ benchmark/ellm_benchmark.py | 2 +- benchmark/loop_ellm_benchmark.py | 6 ++- 3 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 benchmark/benchmark.md diff --git a/benchmark/benchmark.md b/benchmark/benchmark.md new file mode 100644 index 0000000..ef8f1bd --- /dev/null +++ b/benchmark/benchmark.md @@ -0,0 +1,81 @@ +# Benchmark + +## Benchmark a Model +To benchmark a model, run this +* --backend `cpu` | `ipex` | `openvino` | `directml` +* --model_name `Name of the Model` +* --model_path `Path to Model` | `Model Repo ID` +* --token_in `Number of Input Tokens (Max 2048)` +* --token_out `Number of Output Tokens` + +```shell +python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out +``` + + +## Loop to benchmark the models +Customise your benchmarking config +```python +# Define the models +model_names = [ + # model names + +] + +# Define the model paths +model_paths = [ + # path to model in order to model names / model repo id + +] + +# Define the token length +token_in_out = [ + (1024, 1024), + (1024, 512), + (1024, 256), + (1024, 128), + (512, 1024), + (512, 512), + (512, 256), + (512, 128), + (256, 1024), + (256, 512), + (256, 256), + (256, 128), + (128, 1024), + (128, 512), + (128, 256), + (128, 128), +] + +# Choose backend +backend = "cpu" +backend = "directml" +backend = "ipex" +backend = "openvino" + +# Number of loops +loop_count = 20 +``` +```shell +python loop_ellm_benchmark.py +``` + +## Generate a Report (`XLSX`) of a Model's Benchmark +To Generate report for a model, run this +* --model_name `Name of the Model` +```shell +python analyse_detailed_benchmark.py --model_name +``` + +## Generate Reports (`XLSX`) of Models' Benchmark +List out the models that you want to have report of benchmarking +```python +model_names = [ + # model names + +] +``` +```shell +python loop_analyse_detailed_benchmark.py +``` \ No newline at end of file diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index 
d41023d..d46fd0a 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -99,7 +99,7 @@ def main(): parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, ipex, openvino or directml)') parser.add_argument('--model_name', type=str, required=True, help='Name of the model') - parser.add_argument('--model_path', type=str, required=True, help='Path to the model') + parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index 16cb47e..a385f55 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -1,14 +1,18 @@ import subprocess -# Define the models and token lengths +# Define the models model_names = [ # model names + ] +# Define the model paths model_paths = [ # path to model in order to model names / model repo id + ] +# Define the token length token_in_out = [ (1024, 1024), (1024, 512), From 21d95aa54eb0b1e8cff9adcf8c70b7ead9da1c3e Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:13:33 +0800 Subject: [PATCH 12/21] Rename benchmark.md to README.md --- benchmark/{benchmark.md => README.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename benchmark/{benchmark.md => README.md} (99%) diff --git a/benchmark/benchmark.md b/benchmark/README.md similarity index 99% rename from benchmark/benchmark.md rename to benchmark/README.md index ef8f1bd..e8ef2e9 100644 --- a/benchmark/benchmark.md +++ b/benchmark/README.md @@ -78,4 +78,4 @@ model_names = [ ``` ```shell python loop_analyse_detailed_benchmark.py -``` \ No newline at end of file +``` From a038376a6d7e5504d48f6d74da3a44b2a3c2a0b3 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:22:04 +0800 Subject: [PATCH 13/21] Update README.md --- benchmark/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark/README.md b/benchmark/README.md index e8ef2e9..cc32b4f 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -1,4 +1,5 @@ # Benchmark +Allow users to test on themselves to get the benchmark of model(s) on different backend. 
It will analyse the Token In / Out throughput for you in a statistical manner ## Benchmark a Model To benchmark a model, run this From ca93ba991680c8b5e7b6fcb1957982cb7845276a Mon Sep 17 00:00:00 2001 From: szeyu Date: Thu, 15 Aug 2024 15:51:40 +0800 Subject: [PATCH 14/21] fixed the bias for encode and output_token_length for openvino --- benchmark/ellm_benchmark.py | 14 +++++++++---- benchmark/loop_ellm_benchmark.py | 36 ++++++++++++++++---------------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index d46fd0a..ab73107 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -22,6 +22,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n # Add the log file to the logger (it will append if the file already exists) logger.add(log_file, mode='a') + encode_bias = 0 + output_token_bias = 0 # need different parameter for cpu and directml if backend == "cpu": device="cpu" @@ -29,6 +31,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n device="xpu" elif backend == "openvino": device="gpu" + encode_bias = 2 + output_token_bias = 1 elif backend == "directml": device = "" @@ -48,20 +52,22 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n with open(file_path, 'r') as file: prompt_text = file.read() - input_tokens = model.tokenizer.encode(prompt_text)[:input_token_length-1] + input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length - encode_bias)] input_text = model.tokenizer.decode(input_tokens) print(input_text) input_tokens = model.tokenizer.encode(input_text) - print(len(input_tokens)) + + print("input_tokens:",len(input_tokens)) + print("input_token_length:",input_token_length) - assert input_token_length-1 == len(input_tokens) + assert input_token_length == len(input_tokens) PromptInputs = { "prompt": input_text } sampling_params_config = sampling_params.SamplingParams( - max_tokens=output_token_length, + max_tokens=(output_token_length - output_token_bias), top_p=0.1, top_k=1, temperature=1, diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index a385f55..b6e3aaf 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -3,32 +3,32 @@ # Define the models model_names = [ # model names - + "Phi-3-mini-4k-instruct-int8-ov" ] # Define the model paths model_paths = [ # path to model in order to model names / model repo id - + "OpenVINO/Phi-3-mini-4k-instruct-int8-ov" ] # Define the token length token_in_out = [ - (1024, 1024), - (1024, 512), - (1024, 256), - (1024, 128), - (512, 1024), - (512, 512), - (512, 256), - (512, 128), - (256, 1024), - (256, 512), - (256, 256), - (256, 128), - (128, 1024), - (128, 512), - (128, 256), + # (1024, 1024), + # (1024, 512), + # (1024, 256), + # (1024, 128), + # (512, 1024), + # (512, 512), + # (512, 256), + # (512, 128), + # (256, 1024), + # (256, 512), + # (256, 256), + # (256, 128), + # (128, 1024), + # (128, 512), + # (128, 256), (128, 128), ] @@ -36,7 +36,7 @@ # backend = "cpu" # backend = "directml" # backend = "ipex" -# backend = "openvino" +backend = "openvino" # Number of loops loop_count = 20 From 632d651453b16ede2f4b2e1b1e12625a2ba228cf Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:54:47 +0800 Subject: [PATCH 15/21] Update loop_ellm_benchmark.py --- benchmark/loop_ellm_benchmark.py | 36 ++++++++++++++++---------------- 1 
file changed, 18 insertions(+), 18 deletions(-) diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index b6e3aaf..c9fe8c3 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -3,32 +3,32 @@ # Define the models model_names = [ # model names - "Phi-3-mini-4k-instruct-int8-ov" + ] # Define the model paths model_paths = [ # path to model in order to model names / model repo id - "OpenVINO/Phi-3-mini-4k-instruct-int8-ov" + ] # Define the token length token_in_out = [ - # (1024, 1024), - # (1024, 512), - # (1024, 256), - # (1024, 128), - # (512, 1024), - # (512, 512), - # (512, 256), - # (512, 128), - # (256, 1024), - # (256, 512), - # (256, 256), - # (256, 128), - # (128, 1024), - # (128, 512), - # (128, 256), + (1024, 1024), + (1024, 512), + (1024, 256), + (1024, 128), + (512, 1024), + (512, 512), + (512, 256), + (512, 128), + (256, 1024), + (256, 512), + (256, 256), + (256, 128), + (128, 1024), + (128, 512), + (128, 256), (128, 128), ] @@ -36,7 +36,7 @@ # backend = "cpu" # backend = "directml" # backend = "ipex" -backend = "openvino" +# backend = "openvino" # Number of loops loop_count = 20 From e51527d9d6feec35ebe5fe2d9daff8813cf518e3 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Thu, 15 Aug 2024 16:08:03 +0800 Subject: [PATCH 16/21] add prompt bias to fix the token encode margin error for directml --- benchmark/ellm_benchmark.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index ab73107..7d7aead 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -23,6 +23,7 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n logger.add(log_file, mode='a') encode_bias = 0 + prompt_bias = 0 output_token_bias = 0 # need different parameter for cpu and directml if backend == "cpu": @@ -35,6 +36,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n output_token_bias = 1 elif backend == "directml": device = "" + encode_bias = 1 + prompt_bias = 1 model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) @@ -57,10 +60,10 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n print(input_text) input_tokens = model.tokenizer.encode(input_text) - print("input_tokens:",len(input_tokens)) + print("input_tokens:",(prompt_bias + len(input_tokens))) print("input_token_length:",input_token_length) - assert input_token_length == len(input_tokens) + assert input_token_length == (prompt_bias + len(input_tokens)) PromptInputs = { "prompt": input_text From 013adc4d1877a0b39b98cec18d25b68d59138324 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:13:25 +0800 Subject: [PATCH 17/21] Update ellm_benchmark.py --- benchmark/ellm_benchmark.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index 7d7aead..7771768 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -12,7 +12,7 @@ from embeddedllm import engine from embeddedllm import sampling_params -async def benchmark(input_token_length, output_token_length, model_path, model_name, backend): +async def benchmark(input_token_length, output_token_length, model_path, model_name, backend, input_token_bias=0, output_token_bias=0): # Create the profile_model_timing directory if it doesn't exist 
log_dir = "profile_model_timing" os.makedirs(log_dir, exist_ok=True) @@ -22,9 +22,6 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n # Add the log file to the logger (it will append if the file already exists) logger.add(log_file, mode='a') - encode_bias = 0 - prompt_bias = 0 - output_token_bias = 0 # need different parameter for cpu and directml if backend == "cpu": device="cpu" @@ -32,12 +29,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n device="xpu" elif backend == "openvino": device="gpu" - encode_bias = 2 - output_token_bias = 1 elif backend == "directml": device = "" - encode_bias = 1 - prompt_bias = 1 model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) @@ -55,22 +48,17 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n with open(file_path, 'r') as file: prompt_text = file.read() - input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length - encode_bias)] + input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length + input_token_bias)] input_text = model.tokenizer.decode(input_tokens) print(input_text) input_tokens = model.tokenizer.encode(input_text) - - print("input_tokens:",(prompt_bias + len(input_tokens))) - print("input_token_length:",input_token_length) - - assert input_token_length == (prompt_bias + len(input_tokens)) PromptInputs = { "prompt": input_text } sampling_params_config = sampling_params.SamplingParams( - max_tokens=(output_token_length - output_token_bias), + max_tokens=(output_token_length + output_token_bias), top_p=0.1, top_k=1, temperature=1, @@ -111,6 +99,8 @@ def main(): parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') + parser.add_argument('--input_token_bias', type=int, required=False, help='Adjust the input token length') + parser.add_argument('--output_token_bias', type=int, required=False, help='Adjust the output token length') args = parser.parse_args() @@ -120,7 +110,7 @@ def main(): args.token_in = 2048 # Run the async function using asyncio.run() - asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend)) + asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend, args.input_token_bias, args.output_token_bias)) if __name__ == "__main__": - main() \ No newline at end of file + main() From 62e0b2cee90d0e3d03b5d848bf7478acbeb2b664 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:14:54 +0800 Subject: [PATCH 18/21] Update loop_ellm_benchmark.py --- benchmark/loop_ellm_benchmark.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index c9fe8c3..658326e 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -41,6 +41,10 @@ # Number of loops loop_count = 20 +# input and output token bias +input_token_bias = 0 +output_token_bias = 0 + # Path to the ellm_benchmark.py script ellm_benchmark_script = "ellm_benchmark.py" @@ -54,7 +58,9 @@ "--model_name", model_name, "--model_path", model_path, "--token_in", str(input_token_length), - "--token_out", str(output_token_length) 
+ "--token_out", str(output_token_length), + "--input_token_bias", str(input_token_bias), + "--output_token_bias", str(output_token_bias) ] # Execute the command From 03cc6b7aa52967c1df2643f9c64cd270af6f36ff Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:16:53 +0800 Subject: [PATCH 19/21] Update README.md --- benchmark/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmark/README.md b/benchmark/README.md index cc32b4f..710b602 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -8,9 +8,11 @@ To benchmark a model, run this * --model_path `Path to Model` | `Model Repo ID` * --token_in `Number of Input Tokens (Max 2048)` * --token_out `Number of Output Tokens` +* --input_token_bias `Adjust the input token` +* --output_token_bias `Adjust the output token` ```shell -python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out +python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out --input_token_bias --output_token_bias ``` @@ -57,6 +59,10 @@ backend = "openvino" # Number of loops loop_count = 20 + +# input and output token bias +input_token_bias = 0 +output_token_bias = 0 ``` ```shell python loop_ellm_benchmark.py From 4998e2cf889f05aaa298e8881cbf0ab49ced3065 Mon Sep 17 00:00:00 2001 From: szeyu Date: Mon, 2 Sep 2024 15:22:40 +0800 Subject: [PATCH 20/21] update the benchmark loop to loop without having the model load again in every loop --- benchmark/README.md | 3 +- benchmark/ellm_benchmark.py | 70 ++++++++++++++++++++------------ benchmark/loop_ellm_benchmark.py | 6 ++- 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 710b602..f09ffc3 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -10,9 +10,10 @@ To benchmark a model, run this * --token_out `Number of Output Tokens` * --input_token_bias `Adjust the input token` * --output_token_bias `Adjust the output token` +* --loop_count `Adjust the loop count` ```shell -python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out --input_token_bias --output_token_bias +python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out --input_token_bias --output_token_bias --loop_count ``` diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index 7771768..12a2822 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -12,28 +12,8 @@ from embeddedllm import engine from embeddedllm import sampling_params -async def benchmark(input_token_length, output_token_length, model_path, model_name, backend, input_token_bias=0, output_token_bias=0): - # Create the profile_model_timing directory if it doesn't exist - log_dir = "profile_model_timing" - os.makedirs(log_dir, exist_ok=True) - - log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') - - # Add the log file to the logger (it will append if the file already exists) - logger.add(log_file, mode='a') - - # need different parameter for cpu and directml - if backend == "cpu": - device="cpu" - elif backend == "ipex": - device="xpu" - elif backend == "openvino": - device="gpu" - elif backend == "directml": - device = "" - - model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) - +async def benchmark(model, input_token_length, output_token_length, model_name, input_token_bias=0, 
output_token_bias=0): + logger.info(f"Model: {model_name}") model.tokenizer.chat_template = "{% for message in messages %}{{ message['content']}}{% endfor %}" # Override @@ -89,28 +69,64 @@ async def generate(): average_tps = (input_token_length + output_token_length) / total_time_taken logger.info("Average tps: "+ str(average_tps)) - # Remove the logger to close the log file - logger.remove() + def main(): parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") - parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, ipex, openvino or directml)') + parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'npu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, npu, ipex, openvino or directml)') parser.add_argument('--model_name', type=str, required=True, help='Name of the model') parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') parser.add_argument('--input_token_bias', type=int, required=False, help='Adjust the input token length') parser.add_argument('--output_token_bias', type=int, required=False, help='Adjust the output token length') + parser.add_argument('--loop_count', type=int, required=False, help='Adjust the loop count') args = parser.parse_args() + backend = args.backend + model_path = args.model_path + model_name = args.model_name + token_in = args.token_in + token_out = args.token_out + input_token_bias = args.input_token_bias + output_token_bias = args.output_token_bias + loop_count = args.loop_count + # Cap the input tokens to 2048 if args.token_in > 2048: print("Input tokens capped to 2048.") args.token_in = 2048 - # Run the async function using asyncio.run() - asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend, args.input_token_bias, args.output_token_bias)) + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{token_in}_{token_out}.log') + + # Add the log file to the logger + logger.add(log_file, mode='w') + + # need different parameter for cpu and directml + if backend == "cpu": + device="cpu" + elif backend == "npu": + device="npu" + elif backend == "ipex": + device="xpu" + elif backend == "openvino": + device="gpu" + elif backend == "directml": + device = "" + + model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) + + for _ in range(loop_count): + # Run the async function using asyncio.run() + asyncio.run(benchmark(model, token_in, token_out, model_name, input_token_bias, output_token_bias)) + + # Remove the logger to close the log file + logger.remove() if __name__ == "__main__": main() diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index 658326e..d173e8a 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -37,9 +37,10 @@ # backend = "directml" # backend = "ipex" # backend = "openvino" +# backend = "npu" # Number of loops -loop_count = 20 +loop_count = 3 # input and output token bias input_token_bias = 0 @@ -60,7 +61,8 @@ "--token_in", str(input_token_length), "--token_out", 
str(output_token_length), "--input_token_bias", str(input_token_bias), - "--output_token_bias", str(output_token_bias) + "--output_token_bias", str(output_token_bias), + "--loop_count", str(loop_count) ] # Execute the command From 769e558735b4e4d371a861dbd1c8d3c5ddc7bcc0 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:39:31 +0800 Subject: [PATCH 21/21] Update loop_ellm_benchmark.py --- benchmark/loop_ellm_benchmark.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index d173e8a..f78c50f 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -51,19 +51,18 @@ for model_name, model_path in zip(model_names, model_paths): for input_token_length, output_token_length in token_in_out: - for i in range(loop_count): - # Construct the command - command = [ - "python", ellm_benchmark_script, - "--backend", backend, - "--model_name", model_name, - "--model_path", model_path, - "--token_in", str(input_token_length), - "--token_out", str(output_token_length), - "--input_token_bias", str(input_token_bias), - "--output_token_bias", str(output_token_bias), - "--loop_count", str(loop_count) - ] + # Construct the command + command = [ + "python", ellm_benchmark_script, + "--backend", backend, + "--model_name", model_name, + "--model_path", model_path, + "--token_in", str(input_token_length), + "--token_out", str(output_token_length), + "--input_token_bias", str(input_token_bias), + "--output_token_bias", str(output_token_bias), + "--loop_count", str(loop_count) + ] - # Execute the command - subprocess.run(command) + # Execute the command + subprocess.run(command)
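
Patches 20 and 21 together change the benchmarking flow so that `ellm_benchmark.py` builds the engine once and repeats the timed generation `--loop_count` times internally, while `loop_ellm_benchmark.py` only iterates over model and token-length combinations. Below is a minimal sketch of that load-once pattern, assuming the `embeddedllm` engine and sampling-parameter calls exactly as they appear in the diffs above; the helper name `time_one_generation`, the placeholder model path, and the hard-coded loop count are illustrative only.

```python
# Sketch only. It assumes the embeddedllm API exactly as shown in the patches
# above (EmbeddedLLMEngine, SamplingParams, model.generate); everything else
# (helper name, model path, loop count) is a placeholder.
import asyncio
import time

from embeddedllm import engine, sampling_params


async def time_one_generation(model, prompt_text: str, max_tokens: int) -> float:
    """Run a single streamed generation and return the elapsed wall-clock time."""
    params = sampling_params.SamplingParams(
        max_tokens=max_tokens,
        top_p=0.1,
        top_k=1,
        temperature=1,
        repetition_penalty=0.01,
    )
    start = time.perf_counter()
    async for _ in model.generate(
        inputs={"prompt": prompt_text},
        sampling_params=params,
        request_id="benchmark",
        stream=True,
    ):
        pass  # drain the stream; only the timing matters here
    return time.perf_counter() - start


def main():
    # The engine is constructed once, outside the timing loop (the change
    # introduced in patch 20), so repeated runs do not re-load the model.
    model = engine.EmbeddedLLMEngine(
        model_path="path/to/model/or/repo-id",  # placeholder
        vision=False,
        device="cpu",
        backend="cpu",
    )
    for _ in range(3):  # stands in for --loop_count
        elapsed = asyncio.run(time_one_generation(model, "Hello", max_tokens=128))
        print(f"generation took {elapsed:.2f} s")


if __name__ == "__main__":
    main()
```

Keeping the engine outside the loop follows the intent stated in patch 20: repeated iterations measure generation throughput only, without paying the model-load cost again on every run.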