Benchmark Code #27

Open · wants to merge 23 commits into main

Commits (23)
- 50f4c34: update new model list with new reuploaded model and ipex option in mo… (szeyu, Aug 6, 2024)
- ec2f421: fix the typo of mistral repo id (szeyu, Aug 6, 2024)
- dbdefa0: edit to the latest version of models available (szeyu, Aug 6, 2024)
- 0965d51: change the context length of 128k to 131072 (szeyu, Aug 6, 2024)
- b905aa9: Merge branch 'main' into szeyu-autoloader-1 (szeyu, Aug 12, 2024)
- 5dbf495: onnx auto download model if repo id is provided as model path (szeyu, Aug 13, 2024)
- fb2c63e: formated with black (szeyu, Aug 13, 2024)
- f8c8f27: fixed with flake8 (szeyu, Aug 14, 2024)
- d54b4d8: add openvino description and the device gpu (szeyu, Aug 14, 2024)
- 1c3b393: update openvino in modelui list (szeyu, Aug 14, 2024)
- 75eff7c: first commit of benchmark code (szeyu, Aug 15, 2024)
- 608670c: update for the markdown to teach about benchmark code usage (szeyu, Aug 15, 2024)
- 21d95aa: Rename benchmark.md to README.md (szeyu, Aug 15, 2024)
- a038376: Update README.md (szeyu, Aug 15, 2024)
- ca93ba9: fixed the bias for encode and output_token_length for openvino (szeyu, Aug 15, 2024)
- 79af320: Merge branch 'szeyu-benchmark-2' of https://github.com/EmbeddedLLM/em… (szeyu, Aug 15, 2024)
- 632d651: Update loop_ellm_benchmark.py (szeyu, Aug 15, 2024)
- e51527d: add prompt bias to fix the token encode margin error for directml (szeyu, Aug 15, 2024)
- 013adc4: Update ellm_benchmark.py (szeyu, Aug 16, 2024)
- 62e0b2c: Update loop_ellm_benchmark.py (szeyu, Aug 16, 2024)
- 03cc6b7: Update README.md (szeyu, Aug 16, 2024)
- 4998e2c: update the benchmark loop to loop without having the model load again… (szeyu, Sep 2, 2024)
- 769e558: Update loop_ellm_benchmark.py (szeyu, Sep 2, 2024)
5 changes: 3 additions & 2 deletions README.md
@@ -21,7 +21,8 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
## Table Content

- [Supported Models](#supported-models-quick-start)
- - [Onnxruntime Models](./docs/model/onnxruntime_models.md)
+ - [Onnxruntime DirectML Models](./docs/model/onnxruntime_directml_models.md)
+ - [Onnxruntime CPU Models](./docs/model/onnxruntime_cpu_models.md)
- [Ipex-LLM Models](./docs/model/ipex_models.md)
- [Getting Started](#getting-started)
- [Installation From Source](#installation)
@@ -39,7 +40,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
- | Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
+ | Llama-3-8b-chat | 8B | 8192 | [luweigen/Llama-3-8B-Instruct-int4-onnx-directml](https://huggingface.co/luweigen/Llama-3-8B-Instruct-int4-onnx-directml) |
| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
89 changes: 89 additions & 0 deletions benchmark/README.md
@@ -0,0 +1,89 @@
# Benchmark
Lets users benchmark model(s) on different backends themselves. It analyses the Token In / Token Out throughput for you and reports summary statistics.

## Benchmark a Model
To benchmark a model, run `ellm_benchmark.py` with the following arguments:
* --backend `cpu` | `ipex` | `openvino` | `directml`
* --model_name `Name of the Model`
* --model_path `Path to Model` | `Model Repo ID`
* --token_in `Number of Input Tokens (Max 2048)`
* --token_out `Number of Output Tokens`
* --input_token_bias `Adjust the input token count`
* --output_token_bias `Adjust the output token count`
* --loop_count `Number of benchmark runs`

```shell
python ellm_benchmark.py --backend <cpu | ipex | openvino | directml> --model_name <Name of the Model> --model_path <Path to Model | Model Repo ID> --token_in <Number of Input Tokens (Max 2048)> --token_out <Number of Output Tokens> --input_token_bias <int value> --output_token_bias <int value> --loop_count <int value>
```
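
For example, a single run might look like this (the model name, path, and token settings are illustrative; substitute your own):

```shell
python ellm_benchmark.py --backend directml --model_name Phi-3-mini-4k-instruct --model_path microsoft/Phi-3-mini-4k-instruct-onnx --token_in 1024 --token_out 256 --input_token_bias 0 --output_token_bias 0 --loop_count 20
```

The timings are logged to `profile_model_timing/profile_model_timing_<model_name>_<token_in>_<token_out>.log`, which the report scripts below read.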


## Loop to benchmark the models
Customise your benchmarking config in `loop_ellm_benchmark.py`:
```python
# Define the models
model_names = [
# model names

]

# Define the model paths
model_paths = [
    # paths to the models (or model repo IDs), in the same order as model_names

]

# Define the token length
token_in_out = [
(1024, 1024),
(1024, 512),
(1024, 256),
(1024, 128),
(512, 1024),
(512, 512),
(512, 256),
(512, 128),
(256, 1024),
(256, 512),
(256, 256),
(256, 128),
(128, 1024),
(128, 512),
(128, 256),
(128, 128),
]

# Choose a backend (uncomment exactly one)
backend = "cpu"
# backend = "directml"
# backend = "ipex"
# backend = "openvino"

# Number of loops
loop_count = 20

# input and output token bias
input_token_bias = 0
output_token_bias = 0
```
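As a sketch, a filled-in config might look like the following (the model names and repo IDs are illustrative; keep `model_paths` in the same order as `model_names`):

```python
model_names = [
    "Phi-3-mini-4k-instruct",
    "mistral-7b-instruct-v0.3",
]

model_paths = [
    "microsoft/Phi-3-mini-4k-instruct-onnx",
    "EmbeddedLLM/mistral-7b-instruct-v0.3-onnx",
]

backend = "directml"
loop_count = 20
input_token_bias = 0
output_token_bias = 0
```

Then run: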
```shell
python loop_ellm_benchmark.py
```

## Generate a Report (`XLSX`) of a Model's Benchmark
To generate a report for a model, run the command below. The report is written to `statistics/<model_name>_statistics.xlsx`.
* --model_name `Name of the Model`
```shell
python analyse_detailed_benchmark.py --model_name <Name of the Model>
```
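
For example (the model name is illustrative; it must match the `--model_name` used during benchmarking, since the script reads the logs under `profile_model_timing/`):

```shell
python analyse_detailed_benchmark.py --model_name Phi-3-mini-4k-instruct
```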

## Generate Reports (`XLSX`) of Models' Benchmark
List the models that you want benchmark reports for in `loop_analyse_detailed_benchmark.py`:
```python
model_names = [
# model names

]
```
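
For example (again, illustrative names that must match the names used during benchmarking):

```python
model_names = [
    "Phi-3-mini-4k-instruct",
    "mistral-7b-instruct-v0.3",
]
```

Then run: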
```shell
python loop_analyse_detailed_benchmark.py
```
124 changes: 124 additions & 0 deletions benchmark/analyse_detailed_benchmark.py
@@ -0,0 +1,124 @@
import os
import re
import numpy as np
import pandas as pd
import argparse


def extract_data_from_log(log_file):
    average_tps_list = []
    prompt_tokens_per_second_list = []
    new_tokens_per_second_list = []
    error_count = 0
    error_state = False

    if not os.path.exists(log_file):
        print(f"Log file does not exist: {log_file}")
        return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count

    with open(log_file, 'r') as file:
        for line in file:
            if "ERROR" in line:
                error_count += 1
                error_state = True
                continue

            if "Average tps" in line and error_state:
                # Skip the throughput line that follows an errored run
                error_state = False
                continue

            if "Average tps" in line:
                average_tps = float(re.search(r"Average tps: ([\d.]+)", line).group(1))
                average_tps_list.append(average_tps)
                continue

            if "Prompt tokens per second" in line:
                prompt_tokens_per_second = float(re.search(r"Prompt tokens per second: ([\d.]+)", line).group(1))
                prompt_tokens_per_second_list.append(prompt_tokens_per_second)
            if "New tokens per second" in line:
                new_tokens_per_second = float(re.search(r"New tokens per second: ([\d.]+)", line).group(1))
                new_tokens_per_second_list.append(new_tokens_per_second)

    return average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count


def calculate_statistics(data):
    data_np = np.array(data)
    stats = {
        "std": np.std(data_np, ddof=1),  # Sample standard deviation
        "mean": np.mean(data_np),
        "min": np.min(data_np),
        "1%": np.percentile(data_np, 1),
        "25%": np.percentile(data_np, 25),
        "50%": np.percentile(data_np, 50),  # Median
        "75%": np.percentile(data_np, 75),
        "99%": np.percentile(data_np, 99),
        "max": np.max(data_np)
    }
    return stats


def parse_arguments():
    parser = argparse.ArgumentParser(description="Process log files and generate statistics.")
    parser.add_argument('--model_name', type=str, required=True, help='Name of the model')
    return parser.parse_args()


def main(model_name):
    token_ins = [128, 256, 512, 1024]
    token_outs = [128, 256, 512, 1024]

    statistics = []

    # Create the profile_model_timing directory if it doesn't exist
    log_dir = "profile_model_timing"
    os.makedirs(log_dir, exist_ok=True)

    for input_token_length in token_ins:
        for output_token_length in token_outs:
            log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{input_token_length}_{output_token_length}.log')
            average_tps_list, prompt_tokens_per_second_list, new_tokens_per_second_list, error_count = extract_data_from_log(log_file)

            if not average_tps_list and not prompt_tokens_per_second_list and not new_tokens_per_second_list:
                # Log file does not exist or is empty, append "-" for each statistical value
                statistics.append([
                    model_name, input_token_length, output_token_length,
                    "-", "-", "-", "-", "-", "-", "-", "-", "-",
                    "-", "-", "-", "-", "-", "-", "-", "-", "-",
                    "-", "-", "-", "-", "-", "-", "-", "-", "-",
                    error_count
                ])
            else:
                min_len = min(len(average_tps_list), len(prompt_tokens_per_second_list), len(new_tokens_per_second_list))

                if min_len > 0:
                    # Ignore the first 5 measurements before computing statistics
                    prompt_stats = calculate_statistics(prompt_tokens_per_second_list[5:min_len])
                    new_token_stats = calculate_statistics(new_tokens_per_second_list[5:min_len])
                    average_tps_stats = calculate_statistics(average_tps_list[5:min_len])

                    statistics.append([
                        model_name, input_token_length, output_token_length,
                        prompt_stats["std"], prompt_stats["mean"], prompt_stats["min"], prompt_stats["1%"], prompt_stats["25%"], prompt_stats["50%"], prompt_stats["75%"], prompt_stats["99%"], prompt_stats["max"],
                        new_token_stats["std"], new_token_stats["mean"], new_token_stats["min"], new_token_stats["1%"], new_token_stats["25%"], new_token_stats["50%"], new_token_stats["75%"], new_token_stats["99%"], new_token_stats["max"],
                        average_tps_stats["std"], average_tps_stats["mean"], average_tps_stats["min"], average_tps_stats["1%"], average_tps_stats["25%"], average_tps_stats["50%"], average_tps_stats["75%"], average_tps_stats["99%"], average_tps_stats["max"],
                        error_count
                    ])

    # Create a DataFrame
    columns = [
        "Model", "Token In", "Token Out",
        "Token In / sec std", "Token In / sec mean", "Token In / sec min", "Token In / sec 1%", "Token In / sec 25%", "Token In / sec 50%", "Token In / sec 75%", "Token In / sec 99%", "Token In / sec max",
        "Token Out / sec std", "Token Out / sec mean", "Token Out / sec min", "Token Out / sec 1%", "Token Out / sec 25%", "Token Out / sec 50%", "Token Out / sec 75%", "Token Out / sec 99%", "Token Out / sec max",
        "Average Token / sec std", "Average Token / sec mean", "Average Token / sec min", "Average Token / sec 1%", "Average Token / sec 25%", "Average Token / sec 50%", "Average Token / sec 75%", "Average Token / sec 99%", "Average Token / sec max",
        "No of Fail"
    ]
    df = pd.DataFrame(statistics, columns=columns)

    # Create the statistics directory if it doesn't exist
    output_dir = "statistics"
    os.makedirs(output_dir, exist_ok=True)

    # Write to Excel
    output_file = os.path.join(output_dir, f"{model_name}_statistics.xlsx")
    df.to_excel(output_file, index=False)
    print(f"Statistics written to {output_file}")


if __name__ == "__main__":
    args = parse_arguments()
    main(args.model_name)
132 changes: 132 additions & 0 deletions benchmark/ellm_benchmark.py
@@ -0,0 +1,132 @@
import sys
import os
import time
import asyncio
import argparse
from loguru import logger

# Add the 'src' directory to sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src')))

# Import the engine module
from embeddedllm import engine
from embeddedllm import sampling_params


async def benchmark(model, input_token_length, output_token_length, model_name, input_token_bias=0, output_token_bias=0):

    logger.info(f"Model: {model_name}")

    model.tokenizer.chat_template = "{% for message in messages %}{{ message['content']}}{% endfor %}"  # Override

    prompt_text = """

"""
    # Define the path to the file
    file_path = "sampleText.txt"

    # Open the file and read its contents into the variable
    with open(file_path, 'r') as file:
        prompt_text = file.read()

    # Truncate the prompt to the requested input length (plus any bias correction)
    input_tokens = model.tokenizer.encode(prompt_text)[:(input_token_length + input_token_bias)]
    input_text = model.tokenizer.decode(input_tokens)
    print(input_text)
    input_tokens = model.tokenizer.encode(input_text)

    PromptInputs = {
        "prompt": input_text
    }

    sampling_params_config = sampling_params.SamplingParams(
        max_tokens=(output_token_length + output_token_bias),
        top_p=0.1,
        top_k=1,
        temperature=1,
        repetition_penalty=0.01,
    )

    start = time.perf_counter()

    async def generate():
        results = []
        async for response in model.generate(
            inputs=PromptInputs,
            sampling_params=sampling_params_config,
            request_id="benchmark",
            stream=True,
        ):
            results.append(response)
        return results

    response = await generate()
    end = time.perf_counter()

    logger.info(response[0])  # Access the generated text from the response

    total_time_taken = end - start
    logger.info(f"Total time taken: {total_time_taken:.2f} seconds")

    average_tps = (input_token_length + output_token_length) / total_time_taken
    logger.info("Average tps: " + str(average_tps))


def main():
    parser = argparse.ArgumentParser(description="Benchmark EmbeddedLLM models.")
    parser.add_argument('--backend', type=str, required=True, choices=['cpu', 'npu', 'directml', 'openvino', 'ipex'], help='Backend to use (cpu, npu, ipex, openvino or directml)')
    parser.add_argument('--model_name', type=str, required=True, help='Name of the model')
    parser.add_argument('--model_path', type=str, required=True, help='Path to the model or model repo id')
    parser.add_argument('--token_in', type=int, required=True, help='Number of input tokens (max 2048)')
    parser.add_argument('--token_out', type=int, required=True, help='Number of output tokens')
    parser.add_argument('--input_token_bias', type=int, required=False, default=0, help='Adjust the input token length')
    parser.add_argument('--output_token_bias', type=int, required=False, default=0, help='Adjust the output token length')
    parser.add_argument('--loop_count', type=int, required=False, default=1, help='Adjust the loop count')

    args = parser.parse_args()

    backend = args.backend
    model_path = args.model_path
    model_name = args.model_name
    token_in = args.token_in
    token_out = args.token_out
    input_token_bias = args.input_token_bias
    output_token_bias = args.output_token_bias
    loop_count = args.loop_count

    # Cap the input tokens to 2048
    if token_in > 2048:
        print("Input tokens capped to 2048.")
        token_in = 2048

    # Create the profile_model_timing directory if it doesn't exist
    log_dir = "profile_model_timing"
    os.makedirs(log_dir, exist_ok=True)

    log_file = os.path.join(log_dir, f'profile_model_timing_{model_name}_{token_in}_{token_out}.log')

    # Add the log file to the logger
    logger.add(log_file, mode='w')

    # Map each backend to the device parameter expected by the engine
    if backend == "cpu":
        device = "cpu"
    elif backend == "npu":
        device = "npu"
    elif backend == "ipex":
        device = "xpu"
    elif backend == "openvino":
        device = "gpu"
    elif backend == "directml":
        device = ""

    model = engine.EmbeddedLLMEngine(model_path=model_path, vision=False, device=device, backend=backend)

    for _ in range(loop_count):
        # Run the async function using asyncio.run()
        asyncio.run(benchmark(model, token_in, token_out, model_name, input_token_bias, output_token_bias))

    # Remove the logger to close the log file
    logger.remove()


if __name__ == "__main__":
    main()
20 changes: 20 additions & 0 deletions benchmark/loop_analyse_detailed_benchmark.py
@@ -0,0 +1,20 @@
import subprocess

model_names = [
    # model names

]


# Path to the analyse_detailed_benchmark.py script
analyse_detailed_benchmark_script = "analyse_detailed_benchmark.py"

for model_name in model_names:
    # Construct the command
    command = [
        "python", analyse_detailed_benchmark_script,
        "--model_name", model_name,
    ]

    # Execute the command
    subprocess.run(command)