From 50f4c3449c2c3d2c69a406be09f26ee1b100a5e0 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 16:15:20 +0800 Subject: [PATCH 01/21] update new model list with new reuploaded model and ipex option in modelui --- src/embeddedllm/entrypoints/modelui.py | 239 +++++++++++++++++-------- 1 file changed, 169 insertions(+), 70 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index ca1da44..e6fd1b0 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -64,45 +64,103 @@ class ModelCard(BaseModel): context_length: int size: Optional[int] = 0 +ipex_model_dict_list = { + "microsoft/Phi-3-mini-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", + repo_id="microsoft/Phi-3-mini-4k-instruct", + model_name="Phi-3-mini-4k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-mini-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-mini-128k-instruct", + model_name="Phi-3-mini-128k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-medium-4k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-4k-instruct", + model_name="Phi-3-medium-4k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "microsoft/Phi-3-medium-128k-instruct": ModelCard( + hf_url="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/tree/main", + repo_id="microsoft/Phi-3-medium-128k-instruct", + model_name="Phi-3-medium-128k-instruct", + subfolder=".", + repo_type="model", + context_length=4096, + ), +} dml_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/directml/directml-int4-awq-block-128", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="directml/directml-int4-awq-block-128", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml", + model_name="Phi-3-mini-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml", + model_name="Phi-3-mini-128k-instruct-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4", - repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx", - model_name="Phi-3-mini-4k-instruct-062024-onnx", - subfolder="onnx/directml/Phi-3-mini-4k-instruct-062024-int4", + "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml", + model_name="Phi-3-medium-4k-instruct-onnx-directml", + subfolder=".", + repo_type="model", + 
context_length=4096, + ), + "EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml", + model_name="Phi-3-medium-128k-instruct-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + model_name="Phi-3-mini-4k-instruct-062024-int4-onnx-directml", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-onnx", - subfolder="onnx/directml/mistralai_Mistral-7B-Instruct-v0.3-int4", + model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/gemma-2b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx/tree/main/onnx/directml/gemma-2b-it-int4", - repo_id="EmbeddedLLM/gemma-2b-it-onnx", - model_name="gemma-2b-it-int4", - subfolder="onnx/directml/gemma-2b-it-int4", + "EmbeddedLLM/gemma-2b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-2b-it-int4-onnx-directml", + model_name="gemma-2b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/gemma-7b-it-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-onnx/tree/main/onnx/directml/gemma-7b-it-int4", - repo_id="EmbeddedLLM/gemma-7b-it-onnx", - model_name="gemma-7b-it-int4", - subfolder="onnx/directml/gemma-7b-it-int4", + "EmbeddedLLM/gemma-7b-it-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/gemma-7b-it-int4-onnx-directml", + model_name="gemma-7b-it-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), @@ -114,70 +172,94 @@ class ModelCard(BaseModel): repo_type="model", context_length=4096, ), - "EmbeddedLLM/Starling-LM-7b-beta-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-onnx/tree/main/onnx/directml/Starling-LM-7b-beta-int4", - repo_id="EmbeddedLLM/Starling-LM-7b-beta-onnx", - model_name="Starling-LM-7b-beta-int4", - subfolder="onnx/directml/Starling-LM-7b-beta-int4", + "EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml", + model_name="Starling-LM-7b-beta-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-int4": ModelCard( - 
hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/directml/openchat-3.6-8b-20240522-int4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-int4", - subfolder="onnx/directml/openchat-3.6-8b-20240522-int4", + "EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml", + model_name="openchat-3.6-8b-20240522-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx/tree/main/onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", - repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx", - model_name="01-ai_Yi-1.5-6B-Chat-int4", - subfolder="onnx/directml/01-ai_Yi-1.5-6B-Chat-int4", + "EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml/tree/main", + repo_id="EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + model_name="01-ai_Yi-1.5-6B-Chat-int4-onnx-directml", + subfolder=".", repo_type="model", context_length=4096, ), } cpu_model_dict_list = { - "microsoft/Phi-3-mini-4k-instruct": ModelCard( - hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx/tree/main/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="microsoft/Phi-3-mini-4k-instruct-onnx", - model_name="Phi-3-mini-4k-instruct-onnx", - subfolder="cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=4096, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx/tree/main/onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", - model_name="mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/mistral-7b-instruct-v0.3-cpu-int4-rtn-block-32", + "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + model_name="mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=32768, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32-acc-level-4", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4", + subfolder=".", repo_type="model", context_length=8192, ), - "EmbeddedLLM/openchat-3.6-8b-20240522-cpu-int4-rtn-block-32": ModelCard( - hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32", - repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx", - model_name="openchat-3.6-8b-20240522-cpu-int4-rtn-block-32", - subfolder="onnx/cpu_and_mobile/cpu-int4-rtn-block-32", + "EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32": ModelCard( + hf_url="https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main", + repo_id="EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + model_name="openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32", + subfolder=".", repo_type="model", context_length=8192, ), @@ -231,8 +313,13 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in ipex_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, 
repo_type=v.repo_type + ) + -def convert_to_dataframe(dml_model_dict_list): +def convert_to_dataframe(model_dict_list): # Create lists to store the data model_names = [] hf_urls = [] @@ -244,7 +331,7 @@ def convert_to_dataframe(dml_model_dict_list): context_lengths = [] # Iterate through the dictionary and extract the data - for key, model_card in dml_model_dict_list.items(): + for key, model_card in model_dict_list.items(): model_names.append(key) hf_urls.append(model_card.hf_url) repo_ids.append(model_card.repo_id) @@ -318,6 +405,9 @@ def update_model_list(engine_type): if engine_type == "DirectML": models = sorted(list(dml_model_dict_list.keys())) models_pandas = convert_to_dataframe(dml_model_dict_list) + elif engine_type == "Ipex": + models = sorted(list(ipex_model_dict_list.keys())) + models_pandas = convert_to_dataframe(ipex_model_dict_list) else: models = sorted(list(cpu_model_dict_list.keys())) models_pandas = convert_to_dataframe(cpu_model_dict_list) @@ -340,12 +430,14 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None, repo_type="model", ) @@ -402,6 +494,8 @@ def download_model(engine_type, model_name): if engine_type == "DirectML": llm_model_card = dml_model_dict_list[model_name] + elif engine_type == "Ipex": + llm_model_card = ipex_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -412,7 +506,7 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*", + allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." 
else None, repo_type="model", ) yield snapshot_path @@ -443,9 +537,14 @@ def main(): with gr.Accordion("See More Model Details", open=False): model_info_pandas_frame = gr.Dataframe(value=None) + default_value = "CPU" # Default value + if backend == "directml": + default_value = "DirectML" + elif backend == "ipex": + default_value = "Ipex" selected_engine_type = gr.Dropdown( - choices=["DirectML", "CPU"], - value="DirectML" if backend == "directml" else "CPU", + choices=["DirectML", "Ipex", "CPU"], + value = default_value, multiselect=False, label="LLM Engine", show_label=True, From ec2f421cab254685bb0c90ba89424bc475a078fd Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 16:49:01 +0800 Subject: [PATCH 02/21] fix the typo of mistral repo id --- src/embeddedllm/entrypoints/modelui.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index e6fd1b0..212c67b 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -142,7 +142,7 @@ class ModelCard(BaseModel): ), "EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml/tree/main", - repo_id="EmbeddedLLM/mistral-7b-instruct-v0.3-onnx", + repo_id="EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", model_name="mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml", subfolder=".", repo_type="model", From dbdefa04032d8cb1e473130bb00ece8757a68c80 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 17:45:54 +0800 Subject: [PATCH 03/21] edit to the latest version of models available --- README.md | 5 +++-- docs/model/onnxruntime_cpu_models.md | 14 ++++++++++++++ docs/model/onnxruntime_directml_models.md | 19 +++++++++++++++++++ docs/model/onnxruntime_models.md | 19 ------------------- 4 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 docs/model/onnxruntime_cpu_models.md create mode 100644 docs/model/onnxruntime_directml_models.md delete mode 100644 docs/model/onnxruntime_models.md diff --git a/README.md b/README.md index 99542ed..2fcfed5 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E ## Table Content - [Supported Models](#supported-models-quick-start) - - [Onnxruntime Models](./docs/model/onnxruntime_models.md) + - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md) + - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md) - [Ipex-LLM Models](./docs/model/ipex_models.md) - [Getting Started](#getting-started) - [Installation From Source](#installation) @@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). 
E | Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) | | Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | | Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) | -| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | +| Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) | | Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | | Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) | | Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) | diff --git a/docs/model/onnxruntime_cpu_models.md b/docs/model/onnxruntime_cpu_models.md new file mode 100644 index 0000000..6951ac8 --- /dev/null +++ b/docs/model/onnxruntime_cpu_models.md @@ -0,0 +1,14 @@ +# Model Powered by Onnxruntime CPU GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | +|-------------------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.538 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32/tree/main) | +| Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4 | 3.8B | 4096 | 2.585 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32/tree/main) | +| mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4 | 7B | 32768 | 4.66 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32 | 8B | 8192 | 6.339 | 
[EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32/tree/main) | +| openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4 | 8B | 8192 | 6.339 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main) | diff --git a/docs/model/onnxruntime_directml_models.md b/docs/model/onnxruntime_directml_models.md new file mode 100644 index 0000000..0f6a3a3 --- /dev/null +++ b/docs/model/onnxruntime_directml_models.md @@ -0,0 +1,19 @@ +# Model Powered by Onnxruntime DirectML GenAI + +## Supported Models + +| Model Name | Parameters | Context Length | Size (GB) | Link | +|--------------------------------------------|------------|----------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| Phi-3-mini-4k-instruct-onnx-directml | 3.8B | 4096 | 1.989 | [EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml) | +| Phi-3-mini-128k-instruct-onnx-directml | 3.8B | 131072 | 2.018 | [EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-directml) | +| Phi-3-medium-4k-instruct-onnx-directml | 17B | 4096 | 6.987 | [EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml) | +| Phi-3-medium-128k-instruct-onnx-directml | 17B | 131072 | 7.025 | [EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-medium-128k-instruct-onnx-directml) | +| Phi-3-mini-4k-instruct-062024-int4-onnx-directml | 3.8B | 4096 | 2.137 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml) | +| mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml | 7B | 32768 | 3.988 | [EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/mistralai_Mistral-7B-Instruct-v0.3-int4-onnx-directml) | +| gemma-2b-it-int4-onnx-directml | 2B | 8192 | 2.314 | [EmbeddedLLM/gemma-2b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-2b-it-int4-onnx-directml) | +| gemma-7b-it-int4-onnx-directml | 7B | 8192 | 5.958 | [EmbeddedLLM/gemma-7b-it-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/gemma-7b-it-int4-onnx-directml) | +| llama-2-7b-chat-int4-onnx-directml | 7B | 4096 | 3.708 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | +| Starling-LM-7b-beta-int4-onnx-directml | 7B | 8192 | 3.974 | [EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/Starling-LM-7b-beta-int4-onnx-directml) | +| openchat-3.6-8b-20240522-int4-onnx-directml | 8B | 8192 | 4.922 | [EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-int4-onnx-directml) | +| Yi-1.5-6B-Chat-int4-onnx-directml | 6B | 32768 | 3.532 | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-int4-onnx-directml) | + diff --git a/docs/model/onnxruntime_models.md b/docs/model/onnxruntime_models.md deleted file mode 100644 index 4d61ffe..0000000 --- a/docs/model/onnxruntime_models.md +++ 
/dev/null @@ -1,19 +0,0 @@ -# Model Powered by Onnxruntime GenAI - -## Supported Models - -| Models | Parameters | Context Length | Link | -| --- | --- | --- | --- | -| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) | -| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) | -| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) | -| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) | -| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) | -| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) | -| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) | -| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) | -| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) | -| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) | -| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) | -| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) | From 0965d51145a0b62cac1fe7d36300b2ac8ff9a038 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 6 Aug 2024 17:46:16 +0800 Subject: [PATCH 04/21] change the context length of 128k to 131072 --- src/embeddedllm/entrypoints/modelui.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 212c67b..1bbf9de 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -79,7 +79,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "microsoft/Phi-3-medium-4k-instruct": ModelCard( hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main", @@ -95,7 +95,7 @@ class ModelCard(BaseModel): model_name="Phi-3-medium-128k-instruct", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), } @@ -114,7 +114,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct-onnx-directml", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml": ModelCard( 
hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-medium-4k-instruct-onnx-directml/tree/main", @@ -130,7 +130,7 @@ class ModelCard(BaseModel): model_name="Phi-3-medium-128k-instruct-onnx-directml", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-int4-onnx-directml/tree/main", @@ -221,7 +221,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", @@ -229,7 +229,7 @@ class ModelCard(BaseModel): model_name="Phi-3-mini-128k-instruct-onnx-cpu-int4-rtn-block-32-acc-level-4", subfolder=".", repo_type="model", - context_length=4096, + context_length=131072, ), "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4": ModelCard( hf_url="https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx-cpu-int4-rtn-block-32-acc-level-4/tree/main", From 5dbf495e8f996b8322479e86f2fbe13ee136cd6a Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 13 Aug 2024 15:53:25 +0800 Subject: [PATCH 05/21] onnx auto download model if repo id is provided as model path --- src/embeddedllm/backend/onnxruntime_engine.py | 11 ++++++++ src/embeddedllm/entrypoints/modelui.py | 25 ++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/embeddedllm/backend/onnxruntime_engine.py b/src/embeddedllm/backend/onnxruntime_engine.py index 82b5dca..95d13c3 100644 --- a/src/embeddedllm/backend/onnxruntime_engine.py +++ b/src/embeddedllm/backend/onnxruntime_engine.py @@ -1,9 +1,11 @@ # from embeddedllm.transformers_utils.image_processing_phi3v import Phi3VImageProcessor import contextlib import time +import os from pathlib import Path from tempfile import TemporaryDirectory from typing import AsyncIterator, List, Optional +from huggingface_hub import snapshot_download import onnxruntime_genai as og from loguru import logger @@ -39,6 +41,15 @@ def onnx_generator_context(model, params): class OnnxruntimeEngine(BaseLLMEngine): def __init__(self, model_path: str, vision: bool, device: str = "cpu"): self.model_path = model_path + + if not os.path.exists(model_path): + snapshot_path = snapshot_download( + repo_id=model_path, + allow_patterns=None, + repo_type="model", + ) + model_path = snapshot_path + self.model_config = AutoConfig.from_pretrained(self.model_path, trust_remote_code=True) self.device = device diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 1bbf9de..66fdd20 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -441,32 +441,49 @@ def deploy_model(engine_type, model_name, port_number): repo_type="model", ) - model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + if llm_model_card.subfolder != ".": + model_path = os.path.join(snapshot_path, llm_model_card.subfolder) + else: + model_path = snapshot_path + + print("Model path:",model_path) + if engine_type == 'Ipex': + device = 'xpu' + + else: + device = 'cpu' + deployed_model.process = subprocess.Popen( [ "ellm_server", "--model_path", model_path, + "--backend", + backend, + "--device", + device, "--port", 
f"{port_number}", - "--served_model_name", - model_name, + # "--served_model_name", + # model_name ] ) + deployed_model.model_name = model_name while True: # ping the server to see if it is up. if check_health(f"http://localhost:{port_number}/health"): break - + deployment_message = f"""

Deployment Status:

Model: {model_name}

Engine: {engine_type}

Port: {port_number}

+
+    Model Path: {model_path}

""" From fb2c63ebad4b4a71c8a7eedcda2a3f0cb45e5e31 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Tue, 13 Aug 2024 17:47:03 +0800 Subject: [PATCH 06/21] formated with black --- src/embeddedllm/entrypoints/modelui.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 66fdd20..3a62e04 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -64,6 +64,7 @@ class ModelCard(BaseModel): context_length: int size: Optional[int] = 0 + ipex_model_dict_list = { "microsoft/Phi-3-mini-4k-instruct": ModelCard( hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", @@ -437,7 +438,9 @@ def deploy_model(engine_type, model_name, port_number): snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None, + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) @@ -446,14 +449,14 @@ def deploy_model(engine_type, model_name, port_number): else: model_path = snapshot_path - print("Model path:",model_path) + print("Model path:", model_path) + + if engine_type == "Ipex": + device = "xpu" - if engine_type == 'Ipex': - device = 'xpu' - else: - device = 'cpu' - + device = "cpu" + deployed_model.process = subprocess.Popen( [ "ellm_server", @@ -476,7 +479,7 @@ def deploy_model(engine_type, model_name, port_number): # ping the server to see if it is up. if check_health(f"http://localhost:{port_number}/health"): break - + deployment_message = f"""

Deployment Status:

@@ -523,7 +526,9 @@ def download_model(engine_type, model_name): yield "Downloading ..." snapshot_path = snapshot_download( repo_id=llm_model_card.repo_id, - allow_patterns=f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None, + allow_patterns=( + f"{llm_model_card.subfolder}/*" if llm_model_card.subfolder != "." else None + ), repo_type="model", ) yield snapshot_path @@ -561,7 +566,7 @@ def main(): default_value = "Ipex" selected_engine_type = gr.Dropdown( choices=["DirectML", "Ipex", "CPU"], - value = default_value, + value=default_value, multiselect=False, label="LLM Engine", show_label=True, From f8c8f27d4eb410ddfbd1886e3d916ab07adc27e3 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Wed, 14 Aug 2024 11:13:19 +0800 Subject: [PATCH 07/21] fixed with flake8 --- src/embeddedllm/inputs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/embeddedllm/inputs.py b/src/embeddedllm/inputs.py index 9797d05..8f05498 100644 --- a/src/embeddedllm/inputs.py +++ b/src/embeddedllm/inputs.py @@ -23,13 +23,13 @@ class ImagePixelData(TypedDict): # https://github.com/vllm-project/vllm/pull/4028 @overload -def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: - ... +def parse_and_batch_prompt(prompt: Union[str, List[str]]) -> Sequence[ParsedText]: ... @overload -def parse_and_batch_prompt(prompt: Union[List[int], List[List[int]]]) -> Sequence[ParsedTokens]: - ... +def parse_and_batch_prompt( + prompt: Union[List[int], List[List[int]]] +) -> Sequence[ParsedTokens]: ... def parse_and_batch_prompt( From d54b4d8d673717388c2d97e3d62dd360220371c2 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Wed, 14 Aug 2024 11:13:45 +0800 Subject: [PATCH 08/21] add openvino description and the device gpu --- src/embeddedllm/engine.py | 2 +- src/embeddedllm/entrypoints/api_server.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/embeddedllm/engine.py b/src/embeddedllm/engine.py index 3eac11c..86f589c 100644 --- a/src/embeddedllm/engine.py +++ b/src/embeddedllm/engine.py @@ -80,7 +80,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend: else: raise ValueError( - f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda` and `directml`." + f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`." 
) self.tokenizer = self.engine.tokenizer diff --git a/src/embeddedllm/entrypoints/api_server.py b/src/embeddedllm/entrypoints/api_server.py index 9385f24..efc2916 100644 --- a/src/embeddedllm/entrypoints/api_server.py +++ b/src/embeddedllm/entrypoints/api_server.py @@ -28,9 +28,9 @@ class Config(BaseSettings): ) port: int = Field(default=6979, description="Server port.") host: str = Field(default="0.0.0.0", description="Server host.") - device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`") + device: str = Field(default="cpu", description="Device type: `cpu`, `xpu`, `gpu`") backend: str = Field( - default="directml", description="Backend engine: `cpu`, `ipex` and `directml`" + default="directml", description="Backend engine: `cpu`, `ipex`, `openvino` and `directml`" ) response_role: str = Field(default="assistant", description="Server response role.") uvicorn_log_level: str = Field( From 1c3b393ba10aac9af7745b0e45ecd3498e86cb61 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Wed, 14 Aug 2024 14:48:53 +0800 Subject: [PATCH 09/21] update openvino in modelui list --- src/embeddedllm/entrypoints/modelui.py | 88 +++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py index 3a62e04..cc1e15c 100644 --- a/src/embeddedllm/entrypoints/modelui.py +++ b/src/embeddedllm/entrypoints/modelui.py @@ -20,7 +20,7 @@ def get_embeddedllm_backend(): version = importlib.metadata.version("embeddedllm") # Use regex to extract the backend - match = re.search(r"\+(directml|cpu|cuda|ipex)$", version) + match = re.search(r"\+(directml|cpu|cuda|ipex|openvino)$", version) if match: backend = match.group(1) @@ -65,6 +65,73 @@ class ModelCard(BaseModel): size: Optional[int] = 0 +openvino_model_dict_list = { + # "OpenVINO/Phi-3-mini-128k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-128k-instruct-int4-ov", + # model_name="Phi-3-mini-128k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=131072, + # ), + "OpenVINO/Phi-3-mini-128k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-128k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-128k-instruct-int8-ov", + model_name="Phi-3-mini-128k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=131072, + ), + # "OpenVINO/Phi-3-mini-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-mini-4k-instruct-int4-ov", + # model_name="Phi-3-mini-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-mini-4k-instruct-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-mini-4k-instruct-int8-ov", + model_name="Phi-3-mini-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + # "OpenVINO/Phi-3-medium-4k-instruct-int4-ov": ModelCard( + # hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int4-ov/tree/main/", + # repo_id="OpenVINO/Phi-3-medium-4k-instruct-int4-ov", + # model_name="Phi-3-medium-4k-instruct-int4-ov", + # subfolder=".", + # repo_type="model", + # context_length=4096, + # ), + "OpenVINO/Phi-3-medium-4k-instruct-int8-ov": ModelCard( + 
hf_url="https://huggingface.co/OpenVINO/Phi-3-medium-4k-instruct-int8-ov/tree/main/", + repo_id="OpenVINO/Phi-3-medium-4k-instruct-int8-ov", + model_name="Phi-3-medium-4k-instruct-int8-ov", + subfolder=".", + repo_type="model", + context_length=4096, + ), + "OpenVINO/open_llama_7b_v2-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/open_llama_7b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_7b_v2-int8-ov", + model_name="open_llama_7b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), + "OpenVINO/open_llama_3b_v2-int8-ov": ModelCard( + hf_url="https://huggingface.co/OpenVINO/open_llama_3b_v2-int8-ov/tree/main/", + repo_id="OpenVINO/open_llama_3b_v2-int8-ov", + model_name="open_llama_3b_v2-int8-ov", + subfolder=".", + repo_type="model", + context_length=2048, + ), +} + ipex_model_dict_list = { "microsoft/Phi-3-mini-4k-instruct": ModelCard( hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/", @@ -319,6 +386,11 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"): repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type ) +for k, v in openvino_model_dict_list.items(): + v.size = compute_memory_size( + repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type + ) + def convert_to_dataframe(model_dict_list): # Create lists to store the data @@ -409,6 +481,9 @@ def update_model_list(engine_type): elif engine_type == "Ipex": models = sorted(list(ipex_model_dict_list.keys())) models_pandas = convert_to_dataframe(ipex_model_dict_list) + elif engine_type == 'OpenVino': + models = sorted(list(openvino_model_dict_list.keys())) + models_pandas = convert_to_dataframe(openvino_model_dict_list) else: models = sorted(list(cpu_model_dict_list.keys())) models_pandas = convert_to_dataframe(cpu_model_dict_list) @@ -433,6 +508,8 @@ def deploy_model(engine_type, model_name, port_number): llm_model_card = dml_model_dict_list[model_name] elif engine_type == "Ipex": llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -453,7 +530,8 @@ def deploy_model(engine_type, model_name, port_number): if engine_type == "Ipex": device = "xpu" - + elif engine_type == "OpenVino": + device = "gpu" else: device = "cpu" @@ -516,6 +594,8 @@ def download_model(engine_type, model_name): llm_model_card = dml_model_dict_list[model_name] elif engine_type == "Ipex": llm_model_card = ipex_model_dict_list[model_name] + elif engine_type == "OpenVino": + llm_model_card = openvino_model_dict_list[model_name] else: llm_model_card = cpu_model_dict_list[model_name] @@ -564,8 +644,10 @@ def main(): default_value = "DirectML" elif backend == "ipex": default_value = "Ipex" + elif backend == "openvino": + default_value = "OpenVino" selected_engine_type = gr.Dropdown( - choices=["DirectML", "Ipex", "CPU"], + choices=["DirectML", "Ipex", "OpenVino", "CPU"], value=default_value, multiselect=False, label="LLM Engine", From 75eff7cf6c1155c5362f0c7a29d7eff04ce43709 Mon Sep 17 00:00:00 2001 From: szeyu Date: Thu, 15 Aug 2024 14:43:44 +0800 Subject: [PATCH 10/21] first commit of benchmark code --- benchmark/analyse_detailed_benchmark.py | 124 +++++++++++++++++++ benchmark/ellm_benchmark.py | 117 +++++++++++++++++ benchmark/loop_analyse_detailed_benchmark.py | 20 +++ benchmark/loop_ellm_benchmark.py | 57 +++++++++ benchmark/sampleText.txt | 91 ++++++++++++++ 5 files changed, 409 insertions(+) create mode 
100644 benchmark/analyse_detailed_benchmark.py create mode 100644 benchmark/ellm_benchmark.py create mode 100644 benchmark/loop_analyse_detailed_benchmark.py create mode 100644 benchmark/loop_ellm_benchmark.py create mode 100644 benchmark/sampleText.txt diff --git a/benchmark/analyse_detailed_benchmark.py b/benchmark/analyse_detailed_benchmark.py new file mode 100644 index 0000000..ca45d30 --- /dev/null +++ b/benchmark/analyse_detailed_benchmark.py @@ -0,0 +1,124 @@ +import os +import re +import numpy as np +import pandas as pd +import argparse + +def extract_data_from_log(log_file): + average_tps_list = [] + prompt_tokens_per_second_list = [] + new_tokens_per_second_list = [] + error_count = 0 + error_state = False + + if not os.path.exists(log_file): + print(f"Log file does not exist: {log_file}") + return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count + + with open(log_file, 'r') as file: + for line in file: + if "ERROR" in line: + error_count += 1 + error_state = True + continue + + if "Average tps" in line and error_state == True: + error_state = False + continue + + if "Average tps" in line: + average_tps = float(re.search(r"Average tps: ([\d.]+)", line).group(1)) + average_tps_list.append(average_tps) + continue + + if "Prompt tokens per second" in line: + prompt_tokens_per_second = float(re.search(r"Prompt tokens per second: ([\d.]+)", line).group(1)) + prompt_tokens_per_second_list.append(prompt_tokens_per_second) + if "New tokens per second" in line: + new_tokens_per_second = float(re.search(r"New tokens per second: ([\d.]+)", line).group(1)) + new_tokens_per_second_list.append(new_tokens_per_second) + + return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count + +def calculate_statistics(data): + data_np = np.array(data) + stats = { + "std": np.std(data_np, ddof=1), # Sample standard deviation + "mean": np.mean(data_np), + "min": np.min(data_np), + "1%": np.percentile(data_np, 1), + "25%": np.percentile(data_np, 25), + "50%": np.percentile(data_np, 50), # Median + "75%": np.percentile(data_np, 75), + "99%": np.percentile(data_np, 99), + "max": np.max(data_np) + } + return stats + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Process log files and generate statistics.") + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + return parser.parse_args() + +def main(model_name): + token_ins = [128, 256, 512, 1024] + token_outs = [128, 256, 512, 1024] + + statistics = [] + + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + for input_token_length in token_ins: + for output_token_length in token_outs: + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') + average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count = extract_data_from_log(log_file) + + if not average_tps_list and not prompt_tokens_per_second_list and not new_tokens_per_second_list: + # Log file does not exist or is empty, append "-" for each statistical value + statistics.append([ + model_name, input_token_length, output_token_length, + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + "-", "-", "-", "-", "-", "-", "-", "-", "-", + error_count + ]) + else: + min_len = min(len(average_tps_list), len(prompt_tokens_per_second_list), 
len(new_tokens_per_second_list)) + + if min_len > 0: + prompt_stats = calculate_statistics(prompt_tokens_per_second_list[5:min_len]) + new_token_stats = calculate_statistics(new_tokens_per_second_list[5:min_len]) + average_tps_stats = calculate_statistics(average_tps_list[5:min_len]) + + statistics.append([ + model_name, input_token_length, output_token_length, + prompt_stats["std"], prompt_stats["mean"], prompt_stats["min"], prompt_stats["1%"], prompt_stats["25%"], prompt_stats["50%"], prompt_stats["75%"], prompt_stats["99%"], prompt_stats["max"], + new_token_stats["std"], new_token_stats["mean"], new_token_stats["min"], new_token_stats["1%"], new_token_stats["25%"], new_token_stats["50%"], new_token_stats["75%"], new_token_stats["99%"], new_token_stats["max"], + average_tps_stats["std"], average_tps_stats["mean"], average_tps_stats["min"], average_tps_stats["1%"], average_tps_stats["25%"], average_tps_stats["50%"], average_tps_stats["75%"], average_tps_stats["99%"], average_tps_stats["max"], + error_count + ]) + + # Create a DataFrame + columns = [ + "Model", "Token In", "Token Out", + "Token In / sec std", "Token In / sec mean", "Token In / sec min", "Token In / sec 1%", "Token In / sec 25%", "Token In / sec 50%", "Token In / sec 75%", "Token In / sec 99%", "Token In / sec max", + "Token Out / sec std", "Token Out / sec mean", "Token Out / sec min", "Token Out / sec 1%", "Token Out / sec 25%", "Token Out / sec 50%", "Token Out / sec 75%", "Token Out / sec 99%", "Token Out / sec max", + "Average Token / sec std", "Average Token / sec mean", "Average Token / sec min", "Average Token / sec 1%", "Average Token / sec 25%", "Average Token / sec 50%", "Average Token / sec 75%", "Average Token / sec 99%", "Average Token / sec max", + "No of Fail" + ] + df = pd.DataFrame(statistics, columns=columns) + + # Create the statistics directory if it doesn't exist + output_dir = "statistics" + os.makedirs(output_dir, exist_ok=True) + + # Write to Excel + output_file = os.path.join(output_dir, f"{model_name}_statistics.xlsx") + df.to_excel(output_file, index=False) + print(f"Statistics written to {output_file}") + +if __name__ == "__main__": + args = parse_arguments() + main(args.model_name) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py new file mode 100644 index 0000000..d41023d --- /dev/null +++ b/benchmark/ellm_benchmark.py @@ -0,0 +1,117 @@ +import sys +import os +import time +import asyncio +import argparse +from loguru import logger + +# Add the 'src' directory to sys.path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src'))) + +# Import the engine module +from embeddedllm import engine +from embeddedllm import sampling_params + +async def benchmark(input_token_length, output_token_length, model_path, model_name, backend): + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') + + # Add the log file to the logger (it will append if the file already exists) + logger.add(log_file, mode='a') + + # need different parameter for cpu and directml + if backend == "cpu": + device="cpu" + elif backend == "ipex": + device="xpu" + elif backend == "openvino": + device="gpu" + elif backend == "directml": + device = "" + + model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) + + logger.info(f"Model: 
{model_name}") + + model.tokenizer.chat_template = "{% for message in messages %}{{ message['content']}}{% endfor %}" # Override + + prompt_text = """ + + """ + # Define the path to the file + file_path = "sampleText.txt" + + # Open the file and read its contents into the variable + with open(file_path, 'r') as file: + prompt_text = file.read() + + input_tokens = model.tokenizer.encode(prompt_text)[:input_token_length-1] + input_text = model.tokenizer.decode(input_tokens) + print(input_text) + input_tokens = model.tokenizer.encode(input_text) + print(len(input_tokens)) + + assert input_token_length-1 == len(input_tokens) + + PromptInputs = { + "prompt": input_text + } + + sampling_params_config = sampling_params.SamplingParams( + max_tokens=output_token_length, + top_p=0.1, + top_k=1, + temperature=1, + repetition_penalty=0.01, + ) + + start = time.perf_counter() + + async def generate(): + results = [] + async for response in model.generate( + inputs=PromptInputs, + sampling_params=sampling_params_config, + request_id="benchmark", + stream=True, + ): + results.append(response) + return results + + response = await generate() + end = time.perf_counter() + + logger.info(response[0]) # Access the generated text from the response + + total_time_taken = end - start + logger.info(f"Total time taken: {total_time_taken:.2f} seconds") + + average_tps = (input_token_length + output_token_length) / total_time_taken + logger.info("Average tps: "+ str(average_tps)) + + # Remove the logger to close the log file + logger.remove() + +def main(): + parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") + parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, ipex, openvino or directml)') + parser.add_argument('--model_name', type=str, required=True, help='Name of the model') + parser.add_argument('--model_path', type=str, required=True, help='Path to the model') + parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') + parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') + + args = parser.parse_args() + + # Cap the input tokens to 2048 + if args.token_in > 2048: + print("Input tokens capped to 2048.") + args.token_in = 2048 + + # Run the async function using asyncio.run() + asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/benchmark/loop_analyse_detailed_benchmark.py b/benchmark/loop_analyse_detailed_benchmark.py new file mode 100644 index 0000000..e01bdda --- /dev/null +++ b/benchmark/loop_analyse_detailed_benchmark.py @@ -0,0 +1,20 @@ +import subprocess + +model_names = [ + # model names + +] + + +# Path to the ellm_benchmark.py script +analyse_detailed_benchmark_script = "analyse_detailed_benchmark.py" + +for model_name in model_names: + # Construct the command + command = [ + "python", analyse_detailed_benchmark_script, + "--model_name", model_name, + ] + + # Execute the command + subprocess.run(command) \ No newline at end of file diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py new file mode 100644 index 0000000..16cb47e --- /dev/null +++ b/benchmark/loop_ellm_benchmark.py @@ -0,0 +1,57 @@ +import subprocess + +# Define the models and token lengths +model_names = [ + # model names +] + +model_paths = [ + # path to model in order to model names / model 
repo id +] + +token_in_out = [ + (1024, 1024), + (1024, 512), + (1024, 256), + (1024, 128), + (512, 1024), + (512, 512), + (512, 256), + (512, 128), + (256, 1024), + (256, 512), + (256, 256), + (256, 128), + (128, 1024), + (128, 512), + (128, 256), + (128, 128), +] + +# Choose backend +# backend = "cpu" +# backend = "directml" +# backend = "ipex" +# backend = "openvino" + +# Number of loops +loop_count = 20 + +# Path to the ellm_benchmark.py script +ellm_benchmark_script = "ellm_benchmark.py" + +for model_name, model_path in zip(model_names, model_paths): + for input_token_length, output_token_length in token_in_out: + for i in range(loop_count): + # Construct the command + command = [ + "python", ellm_benchmark_script, + "--backend", backend, + "--model_name", model_name, + "--model_path", model_path, + "--token_in", str(input_token_length), + "--token_out", str(output_token_length) + ] + + # Execute the command + subprocess.run(command) diff --git a/benchmark/sampleText.txt b/benchmark/sampleText.txt new file mode 100644 index 0000000..3da3fbb --- /dev/null +++ b/benchmark/sampleText.txt @@ -0,0 +1,91 @@ +A large language model (LLM) is a computational model notable for its ability to achieve general-purpose language +generation and other natural language processing tasks such as classification. Based on language models, LLMs acquire +these abilities by learning statistical relationships from vast amounts of text during a computationally intensive +self-supervised and semi-supervised training process.[1] LLMs can be used for text generation, a form of generative AI, +by taking an input text and repeatedly predicting the next token or word.[2] + +LLMs are artificial neural networks that utilize the transformer architecture, invented in 2017. The largest and +most capable LLMs, as of June 2024, are built with a decoder-only transformer-based architecture, which enables +efficient processing and generation of large-scale text data. + +Historically, up to 2020, fine-tuning was the primary method used to adapt a model for specific tasks. However, +larger models such as GPT-3 have demonstrated the ability to achieve similar results through prompt engineering, +which involves crafting specific input prompts to guide the model's responses.[3] These models acquire knowledge +about syntax, semantics, and ontologies[4] inherent in human language corpora, but they also inherit inaccuracies +and biases present in the data they are trained on.[5] + +Some notable LLMs are OpenAI's GPT series of models (e.g., GPT-3.5 and GPT-4, used in ChatGPT and Microsoft Copilot), +Google's Gemini (the latter of which is currently used in the chatbot of the same name), Meta's LLaMA family of models, +Anthropic's Claude models, and Mistral AI's models. + +History +Before 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, +the IBM alignment models pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 +billion words achieved then-SOTA perplexity.[6] In the 2000s, as Internet use became prevalent, some researchers +constructed Internet-scale language datasets ("web as corpus"[7]), upon which they trained statistical language +models.[8][9] In 2009, in most language processing tasks, statistical language models dominated over symbolic +language models, as they can usefully ingest large datasets.[10] + +After neural networks became dominant in image processing around 2012, they were applied to language modelling as +well. 
Google converted its translation service to Neural Machine Translation in 2016. As it was before Transformers, +it was done by seq2seq deep LSTM networks. + + +An illustration of main components of the transformer model from the original paper, where layers were normalized +after (instead of before) multiheaded attention At the 2017 NeurIPS conference, Google researchers introduced the +transformer architecture in their landmark paper "Attention Is All You Need". This paper's goal was to improve upon +2014 Seq2seq technology,[11] and was based mainly on the attention mechanism developed by Bahdanau et al. in 2014. +[12] The following year in 2018, BERT was introduced and quickly became "ubiquitous".[13] Though the original +transformer has both encoder and decoder blocks, BERT is an encoder-only model. + +Although decoder-only GPT-1 was introduced in 2018, it was GPT-2 in 2019 that caught widespread attention because +OpenAI at first deemed it too powerful to release publicly, out of fear of malicious use.[14] GPT-3 in 2020 went +a step further and as of 2024 is available only via API with no offering of downloading the model to execute locally. +But it was the 2022 consumer-facing browser-based ChatGPT that captured the imaginations of the general population +and caused some media hype and online buzz.[15] The 2023 GPT-4 was praised for its increased accuracy and as a +"holy grail" for its multimodal capabilities.[16] OpenAI did not reveal high-level architecture and the number +of parameters of GPT-4. + +Competing language models have for the most part been attempting to equal the GPT series, at least in terms of +number of parameters.[17] + +Since 2022, source-available models have been gaining popularity, especially at first with BLOOM and LLaMA, though +both have restrictions on the field of use. Mistral AI's models Mistral 7B and Mixtral 8x7b have the more permissive +Apache License. As of June 2024, The Instruction fine tuned variant of the Llama 3 70 billion parameter model is +the most powerful open LLM according to the LMSYS Chatbot Arena Leaderboard, being more powerful than GPT-3.5 but +not as powerful as GPT-4.[18] + +As of 2024, the largest and most capable models are all based on the Transformer architecture. Some recent +implementations are based on other architectures, such as recurrent neural network variants and Mamba +(a state space model).[19][20][21] + +Dataset preprocessing +See also: List of datasets for machine-learning research ยง Internet +Probabilistic tokenization +Because machine learning algorithms process numbers rather than text, the text must be converted to numbers. +In the first step, a vocabulary is decided upon, then integer indexes are arbitrarily but uniquely assigned +to each vocabulary entry, and finally, an embedding is associated to the integer index. Algorithms include +byte-pair encoding and WordPiece. + +Probabilistic tokenization also compresses the datasets. Because LLMs generally require input to be an array +that is not jagged, the shorter texts must be "padded" until they match the length of the longest one. How many +tokens are, on average, needed per word depends on the language of the dataset.[22][23] + +BPE +Using a modification of byte-pair encoding, in the first step, all unique characters (including blanks and +punctuation marks) are treated as an initial set of n-grams (i.e. initial set of uni-grams). 
Successively +the most frequent pair of adjacent characters is merged into a bi-gram and all instances of the pair are +replaced by it. All occurrences of adjacent pairs of (previously merged) n-grams that most frequently occur +together are then again merged into even lengthier n-gram repeatedly until a vocabulary of prescribed size +is obtained (in case of GPT-3, the size is 50257).[24] Token vocabulary consists of integers, spanning from +zero up to the size of the token vocabulary. New words can always be interpreted as combinations of the +tokens and the initial-set uni-grams.[25] + +A token vocabulary based on the frequencies extracted from mainly English corpora uses as few tokens as +possible for an average English word. An average word in another language encoded by such an English-optimized +tokenizer is however split into suboptimal amount of tokens. GPT-2 tokenizer can use up to 15 times more tokens +per word for some languages, for example for the Shan language from Myanmar. Even more widespread languages +such as Portuguese and German have "a premium of 50%" compared to English.[26] + +For example, here is how tokenizer used by GPT-3 (Legacy) split the following sentence tokenizer: texts -> +series of numerical "tokens". \ No newline at end of file From 608670ce2af5cf531a0d6c9b92c75095ebbd95ed Mon Sep 17 00:00:00 2001 From: szeyu Date: Thu, 15 Aug 2024 15:11:14 +0800 Subject: [PATCH 11/21] update for the markdown to teach about benchmark code usage --- benchmark/benchmark.md | 81 ++++++++++++++++++++++++++++++++ benchmark/ellm_benchmark.py | 2 +- benchmark/loop_ellm_benchmark.py | 6 ++- 3 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 benchmark/benchmark.md diff --git a/benchmark/benchmark.md b/benchmark/benchmark.md new file mode 100644 index 0000000..ef8f1bd --- /dev/null +++ b/benchmark/benchmark.md @@ -0,0 +1,81 @@ +# Benchmark + +## Benchmark a Model +To benchmark a model, run this +* --backend `cpu` | `ipex` | `openvino` | `directml` +* --model_name `Name of the Model` +* --model_path `Path to Model` | `Model Repo ID` +* --token_in `Number of Input Tokens (Max 2048)` +* --token_out `Number of Output Tokens` + +```shell +python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out +``` + + +## Loop to benchmark the models +Customise your benchmarking config +```python +# Define the models +model_names = [ + # model names + +] + +# Define the model paths +model_paths = [ + # path to model in order to model names / model repo id + +] + +# Define the token length +token_in_out = [ + (1024, 1024), + (1024, 512), + (1024, 256), + (1024, 128), + (512, 1024), + (512, 512), + (512, 256), + (512, 128), + (256, 1024), + (256, 512), + (256, 256), + (256, 128), + (128, 1024), + (128, 512), + (128, 256), + (128, 128), +] + +# Choose backend +backend = "cpu" +backend = "directml" +backend = "ipex" +backend = "openvino" + +# Number of loops +loop_count = 20 +``` +```shell +python loop_ellm_benchmark.py +``` + +## Generate a Report (`XLSX`) of a Model's Benchmark +To Generate report for a model, run this +* --model_name `Name of the Model` +```shell +python analyse_detailed_benchmark.py --model_name +``` + +## Generate Reports (`XLSX`) of Models' Benchmark +List out the models that you want to have report of benchmarking +```python +model_names = [ + # model names + +] +``` +```shell +python loop_analyse_detailed_benchmark.py +``` \ No newline at end of file diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index 
d41023d..d46fd0a 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -99,7 +99,7 @@ def main(): parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, ipex, openvino or directml)') parser.add_argument('--model_name', type=str, required=True, help='Name of the model') - parser.add_argument('--model_path', type=str, required=True, help='Path to the model') + parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index 16cb47e..a385f55 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -1,14 +1,18 @@ import subprocess -# Define the models and token lengths +# Define the models model_names = [ # model names + ] +# Define the model paths model_paths = [ # path to model in order to model names / model repo id + ] +# Define the token length token_in_out = [ (1024, 1024), (1024, 512), From 21d95aa54eb0b1e8cff9adcf8c70b7ead9da1c3e Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:13:33 +0800 Subject: [PATCH 12/21] Rename benchmark.md to README.md --- benchmark/{benchmark.md => README.md} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename benchmark/{benchmark.md => README.md} (99%) diff --git a/benchmark/benchmark.md b/benchmark/README.md similarity index 99% rename from benchmark/benchmark.md rename to benchmark/README.md index ef8f1bd..e8ef2e9 100644 --- a/benchmark/benchmark.md +++ b/benchmark/README.md @@ -78,4 +78,4 @@ model_names = [ ``` ```shell python loop_analyse_detailed_benchmark.py -``` \ No newline at end of file +``` From a038376a6d7e5504d48f6d74da3a44b2a3c2a0b3 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:22:04 +0800 Subject: [PATCH 13/21] Update README.md --- benchmark/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark/README.md b/benchmark/README.md index e8ef2e9..cc32b4f 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -1,4 +1,5 @@ # Benchmark +Allow users to test on themselves to get the benchmark of model(s) on different backend. 
It will analyse the Token In / Out throughput for you in a statistical manner ## Benchmark a Model To benchmark a model, run this From ca93ba991680c8b5e7b6fcb1957982cb7845276a Mon Sep 17 00:00:00 2001 From: szeyu Date: Thu, 15 Aug 2024 15:51:40 +0800 Subject: [PATCH 14/21] fixed the bias for encode and output_token_length for openvino --- benchmark/ellm_benchmark.py | 14 +++++++++---- benchmark/loop_ellm_benchmark.py | 36 ++++++++++++++++---------------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index d46fd0a..ab73107 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -22,6 +22,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n # Add the log file to the logger (it will append if the file already exists) logger.add(log_file, mode='a') + encode_bias = 0 + output_token_bias = 0 # need different parameter for cpu and directml if backend == "cpu": device="cpu" @@ -29,6 +31,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n device="xpu" elif backend == "openvino": device="gpu" + encode_bias = 2 + output_token_bias = 1 elif backend == "directml": device = "" @@ -48,20 +52,22 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n with open(file_path, 'r') as file: prompt_text = file.read() - input_tokens = model.tokenizer.encode(prompt_text)[:input_token_length-1] + input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length - encode_bias)] input_text = model.tokenizer.decode(input_tokens) print(input_text) input_tokens = model.tokenizer.encode(input_text) - print(len(input_tokens)) + + print("input_tokens:",len(input_tokens)) + print("input_token_length:",input_token_length) - assert input_token_length-1 == len(input_tokens) + assert input_token_length == len(input_tokens) PromptInputs = { "prompt": input_text } sampling_params_config = sampling_params.SamplingParams( - max_tokens=output_token_length, + max_tokens=(output_token_length - output_token_bias), top_p=0.1, top_k=1, temperature=1, diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index a385f55..b6e3aaf 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -3,32 +3,32 @@ # Define the models model_names = [ # model names - + "Phi-3-mini-4k-instruct-int8-ov" ] # Define the model paths model_paths = [ # path to model in order to model names / model repo id - + "OpenVINO/Phi-3-mini-4k-instruct-int8-ov" ] # Define the token length token_in_out = [ - (1024, 1024), - (1024, 512), - (1024, 256), - (1024, 128), - (512, 1024), - (512, 512), - (512, 256), - (512, 128), - (256, 1024), - (256, 512), - (256, 256), - (256, 128), - (128, 1024), - (128, 512), - (128, 256), + # (1024, 1024), + # (1024, 512), + # (1024, 256), + # (1024, 128), + # (512, 1024), + # (512, 512), + # (512, 256), + # (512, 128), + # (256, 1024), + # (256, 512), + # (256, 256), + # (256, 128), + # (128, 1024), + # (128, 512), + # (128, 256), (128, 128), ] @@ -36,7 +36,7 @@ # backend = "cpu" # backend = "directml" # backend = "ipex" -# backend = "openvino" +backend = "openvino" # Number of loops loop_count = 20 From 632d651453b16ede2f4b2e1b1e12625a2ba228cf Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Thu, 15 Aug 2024 15:54:47 +0800 Subject: [PATCH 15/21] Update loop_ellm_benchmark.py --- benchmark/loop_ellm_benchmark.py | 36 ++++++++++++++++---------------- 1 
file changed, 18 insertions(+), 18 deletions(-) diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index b6e3aaf..c9fe8c3 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -3,32 +3,32 @@ # Define the models model_names = [ # model names - "Phi-3-mini-4k-instruct-int8-ov" + ] # Define the model paths model_paths = [ # path to model in order to model names / model repo id - "OpenVINO/Phi-3-mini-4k-instruct-int8-ov" + ] # Define the token length token_in_out = [ - # (1024, 1024), - # (1024, 512), - # (1024, 256), - # (1024, 128), - # (512, 1024), - # (512, 512), - # (512, 256), - # (512, 128), - # (256, 1024), - # (256, 512), - # (256, 256), - # (256, 128), - # (128, 1024), - # (128, 512), - # (128, 256), + (1024, 1024), + (1024, 512), + (1024, 256), + (1024, 128), + (512, 1024), + (512, 512), + (512, 256), + (512, 128), + (256, 1024), + (256, 512), + (256, 256), + (256, 128), + (128, 1024), + (128, 512), + (128, 256), (128, 128), ] @@ -36,7 +36,7 @@ # backend = "cpu" # backend = "directml" # backend = "ipex" -backend = "openvino" +# backend = "openvino" # Number of loops loop_count = 20 From e51527d9d6feec35ebe5fe2d9daff8813cf518e3 Mon Sep 17 00:00:00 2001 From: szeyusim Date: Thu, 15 Aug 2024 16:08:03 +0800 Subject: [PATCH 16/21] add prompt bias to fix the token encode margin error for directml --- benchmark/ellm_benchmark.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index ab73107..7d7aead 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -23,6 +23,7 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n logger.add(log_file, mode='a') encode_bias = 0 + prompt_bias = 0 output_token_bias = 0 # need different parameter for cpu and directml if backend == "cpu": @@ -35,6 +36,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n output_token_bias = 1 elif backend == "directml": device = "" + encode_bias = 1 + prompt_bias = 1 model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) @@ -57,10 +60,10 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n print(input_text) input_tokens = model.tokenizer.encode(input_text) - print("input_tokens:",len(input_tokens)) + print("input_tokens:",(prompt_bias + len(input_tokens))) print("input_token_length:",input_token_length) - assert input_token_length == len(input_tokens) + assert input_token_length == (prompt_bias + len(input_tokens)) PromptInputs = { "prompt": input_text From 013adc4d1877a0b39b98cec18d25b68d59138324 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:13:25 +0800 Subject: [PATCH 17/21] Update ellm_benchmark.py --- benchmark/ellm_benchmark.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index 7d7aead..7771768 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -12,7 +12,7 @@ from embeddedllm import engine from embeddedllm import sampling_params -async def benchmark(input_token_length, output_token_length, model_path, model_name, backend): +async def benchmark(input_token_length, output_token_length, model_path, model_name, backend, input_token_bias=0, output_token_bias=0): # Create the profile_model_timing directory if it doesn't exist 
log_dir = "profile_model_timing" os.makedirs(log_dir, exist_ok=True) @@ -22,9 +22,6 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n # Add the log file to the logger (it will append if the file already exists) logger.add(log_file, mode='a') - encode_bias = 0 - prompt_bias = 0 - output_token_bias = 0 # need different parameter for cpu and directml if backend == "cpu": device="cpu" @@ -32,12 +29,8 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n device="xpu" elif backend == "openvino": device="gpu" - encode_bias = 2 - output_token_bias = 1 elif backend == "directml": device = "" - encode_bias = 1 - prompt_bias = 1 model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) @@ -55,22 +48,17 @@ async def benchmark(input_token_length, output_token_length, model_path, model_n with open(file_path, 'r') as file: prompt_text = file.read() - input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length - encode_bias)] + input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length + input_token_bias)] input_text = model.tokenizer.decode(input_tokens) print(input_text) input_tokens = model.tokenizer.encode(input_text) - - print("input_tokens:",(prompt_bias + len(input_tokens))) - print("input_token_length:",input_token_length) - - assert input_token_length == (prompt_bias + len(input_tokens)) PromptInputs = { "prompt": input_text } sampling_params_config = sampling_params.SamplingParams( - max_tokens=(output_token_length - output_token_bias), + max_tokens=(output_token_length + output_token_bias), top_p=0.1, top_k=1, temperature=1, @@ -111,6 +99,8 @@ def main(): parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') + parser.add_argument('--input_token_bias', type=int, required=False, help='Adjust the input token length') + parser.add_argument('--output_token_bias', type=int, required=False, help='Adjust the output token length') args = parser.parse_args() @@ -120,7 +110,7 @@ def main(): args.token_in = 2048 # Run the async function using asyncio.run() - asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend)) + asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend, args.input_token_bias, args.output_token_bias)) if __name__ == "__main__": - main() \ No newline at end of file + main() From 62e0b2cee90d0e3d03b5d848bf7478acbeb2b664 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:14:54 +0800 Subject: [PATCH 18/21] Update loop_ellm_benchmark.py --- benchmark/loop_ellm_benchmark.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index c9fe8c3..658326e 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -41,6 +41,10 @@ # Number of loops loop_count = 20 +# input and output token bias +input_token_bias = 0 +output_token_bias = 0 + # Path to the ellm_benchmark.py script ellm_benchmark_script = "ellm_benchmark.py" @@ -54,7 +58,9 @@ "--model_name", model_name, "--model_path", model_path, "--token_in", str(input_token_length), - "--token_out", str(output_token_length) 
+ "--token_out", str(output_token_length), + "--input_token_bias", str(input_token_bias), + "--output_token_bias", str(output_token_bias) ] # Execute the command From 03cc6b7aa52967c1df2643f9c64cd270af6f36ff Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Fri, 16 Aug 2024 10:16:53 +0800 Subject: [PATCH 19/21] Update README.md --- benchmark/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmark/README.md b/benchmark/README.md index cc32b4f..710b602 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -8,9 +8,11 @@ To benchmark a model, run this * --model_path `Path to Model` | `Model Repo ID` * --token_in `Number of Input Tokens (Max 2048)` * --token_out `Number of Output Tokens` +* --input_token_bias `Adjust the input token` +* --output_token_bias `Adjust the output token` ```shell -python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out +python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out --input_token_bias --output_token_bias ``` @@ -57,6 +59,10 @@ backend = "openvino" # Number of loops loop_count = 20 + +# input and output token bias +input_token_bias = 0 +output_token_bias = 0 ``` ```shell python loop_ellm_benchmark.py From 4998e2cf889f05aaa298e8881cbf0ab49ced3065 Mon Sep 17 00:00:00 2001 From: szeyu Date: Mon, 2 Sep 2024 15:22:40 +0800 Subject: [PATCH 20/21] update the benchmark loop to loop without having the model load again in every loop --- benchmark/README.md | 3 +- benchmark/ellm_benchmark.py | 70 ++++++++++++++++++++------------ benchmark/loop_ellm_benchmark.py | 6 ++- 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 710b602..f09ffc3 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -10,9 +10,10 @@ To benchmark a model, run this * --token_out `Number of Output Tokens` * --input_token_bias `Adjust the input token` * --output_token_bias `Adjust the output token` +* --loop_count `Adjust the loop count` ```shell -python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out --input_token_bias --output_token_bias +python ellm_benchmark.py --backend --model_name --model_path --token_in --token_out --input_token_bias --output_token_bias --loop_count ``` diff --git a/benchmark/ellm_benchmark.py b/benchmark/ellm_benchmark.py index 7771768..12a2822 100644 --- a/benchmark/ellm_benchmark.py +++ b/benchmark/ellm_benchmark.py @@ -12,28 +12,8 @@ from embeddedllm import engine from embeddedllm import sampling_params -async def benchmark(input_token_length, output_token_length, model_path, model_name, backend, input_token_bias=0, output_token_bias=0): - # Create the profile_model_timing directory if it doesn't exist - log_dir = "profile_model_timing" - os.makedirs(log_dir, exist_ok=True) - - log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log') - - # Add the log file to the logger (it will append if the file already exists) - logger.add(log_file, mode='a') - - # need different parameter for cpu and directml - if backend == "cpu": - device="cpu" - elif backend == "ipex": - device="xpu" - elif backend == "openvino": - device="gpu" - elif backend == "directml": - device = "" - - model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) - +async def benchmark(model, input_token_length, output_token_length, model_name, input_token_bias=0, 
output_token_bias=0): + logger.info(f"Model: {model_name}") model.tokenizer.chat_template = "{% for message in messages %}{{ message['content']}}{% endfor %}" # Override @@ -89,28 +69,64 @@ async def generate(): average_tps = (input_token_length + output_token_length) / total_time_taken logger.info("Average tps: "+ str(average_tps)) - # Remove the logger to close the log file - logger.remove() + def main(): parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.") - parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, ipex, openvino or directml)') + parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'npu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, npu, ipex, openvino or directml)') parser.add_argument('--model_name', type=str, required=True, help='Name of the model') parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id') parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)') parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens') parser.add_argument('--input_token_bias', type=int, required=False, help='Adjust the input token length') parser.add_argument('--output_token_bias', type=int, required=False, help='Adjust the output token length') + parser.add_argument('--loop_count', type=int, required=False, help='Adjust the loop count') args = parser.parse_args() + backend = args.backend + model_path = args.model_path + model_name = args.model_name + token_in = args.token_in + token_out = args.token_out + input_token_bias = args.input_token_bias + output_token_bias = args.output_token_bias + loop_count = args.loop_count + # Cap the input tokens to 2048 if args.token_in > 2048: print("Input tokens capped to 2048.") args.token_in = 2048 - # Run the async function using asyncio.run() - asyncio.run(benchmark(args.token_in, args.token_out, args.model_path, args.model_name, args.backend, args.input_token_bias, args.output_token_bias)) + # Create the profile_model_timing directory if it doesn't exist + log_dir = "profile_model_timing" + os.makedirs(log_dir, exist_ok=True) + + log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{token_in}_{token_out}.log') + + # Add the log file to the logger + logger.add(log_file, mode='w') + + # need different parameter for cpu and directml + if backend == "cpu": + device="cpu" + elif backend == "npu": + device="npu" + elif backend == "ipex": + device="xpu" + elif backend == "openvino": + device="gpu" + elif backend == "directml": + device = "" + + model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend) + + for _ in range(loop_count): + # Run the async function using asyncio.run() + asyncio.run(benchmark(model, token_in, token_out, model_name, input_token_bias, output_token_bias)) + + # Remove the logger to close the log file + logger.remove() if __name__ == "__main__": main() diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index 658326e..d173e8a 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -37,9 +37,10 @@ # backend = "directml" # backend = "ipex" # backend = "openvino" +# backend = "npu" # Number of loops -loop_count = 20 +loop_count = 3 # input and output token bias input_token_bias = 0 @@ -60,7 +61,8 @@ "--token_in", str(input_token_length), "--token_out", 
str(output_token_length), "--input_token_bias", str(input_token_bias), - "--output_token_bias", str(output_token_bias) + "--output_token_bias", str(output_token_bias), + "--loop_count", str(loop_count) ] # Execute the command From 769e558735b4e4d371a861dbd1c8d3c5ddc7bcc0 Mon Sep 17 00:00:00 2001 From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:39:31 +0800 Subject: [PATCH 21/21] Update loop_ellm_benchmark.py --- benchmark/loop_ellm_benchmark.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/benchmark/loop_ellm_benchmark.py b/benchmark/loop_ellm_benchmark.py index d173e8a..f78c50f 100644 --- a/benchmark/loop_ellm_benchmark.py +++ b/benchmark/loop_ellm_benchmark.py @@ -51,19 +51,18 @@ for model_name, model_path in zip(model_names, model_paths): for input_token_length, output_token_length in token_in_out: - for i in range(loop_count): - # Construct the command - command = [ - "python", ellm_benchmark_script, - "--backend", backend, - "--model_name", model_name, - "--model_path", model_path, - "--token_in", str(input_token_length), - "--token_out", str(output_token_length), - "--input_token_bias", str(input_token_bias), - "--output_token_bias", str(output_token_bias), - "--loop_count", str(loop_count) - ] + # Construct the command + command = [ + "python", ellm_benchmark_script, + "--backend", backend, + "--model_name", model_name, + "--model_path", model_path, + "--token_in", str(input_token_length), + "--token_out", str(output_token_length), + "--input_token_bias", str(input_token_bias), + "--output_token_bias", str(output_token_bias), + "--loop_count", str(loop_count) + ] - # Execute the command - subprocess.run(command) + # Execute the command + subprocess.run(command)
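
Patches 20 and 21 together change the benchmarking flow so that `ellm_benchmark.py` builds the engine once and repeats the timed generation `--loop_count` times internally, while `loop_ellm_benchmark.py` only iterates over model and token-length combinations. Below is a minimal sketch of that load-once pattern, assuming the `embeddedllm` engine and sampling-parameter calls exactly as they appear in the diffs above; the helper name `time_one_generation`, the placeholder model path, and the hard-coded loop count are illustrative only.

```python
# Sketch only. It assumes the embeddedllm API exactly as shown in the patches
# above (EmbeddedLLMEngine, SamplingParams, model.generate); everything else
# (helper name, model path, loop count) is a placeholder.
import asyncio
import time

from embeddedllm import engine, sampling_params


async def time_one_generation(model, prompt_text: str, max_tokens: int) -> float:
    """Run a single streamed generation and return the elapsed wall-clock time."""
    params = sampling_params.SamplingParams(
        max_tokens=max_tokens,
        top_p=0.1,
        top_k=1,
        temperature=1,
        repetition_penalty=0.01,
    )
    start = time.perf_counter()
    async for _ in model.generate(
        inputs={"prompt": prompt_text},
        sampling_params=params,
        request_id="benchmark",
        stream=True,
    ):
        pass  # drain the stream; only the timing matters here
    return time.perf_counter() - start


def main():
    # The engine is constructed once, outside the timing loop (the change
    # introduced in patch 20), so repeated runs do not re-load the model.
    model = engine.EmbeddedLLMEngine(
        model_path="path/to/model/or/repo-id",  # placeholder
        vision=False,
        device="cpu",
        backend="cpu",
    )
    for _ in range(3):  # stands in for --loop_count
        elapsed = asyncio.run(time_one_generation(model, "Hello", max_tokens=128))
        print(f"generation took {elapsed:.2f} s")


if __name__ == "__main__":
    main()
```

Keeping the engine outside the loop follows the intent stated in patch 20: repeated iterations measure generation throughput only, without paying the model-load cost again on every run.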