diff --git a/benchmark/benchmark-7b.sh b/benchmark/benchmark-7b.sh
index ebedeef0a4..16a90971f4 100755
--- a/benchmark/benchmark-7b.sh
+++ b/benchmark/benchmark-7b.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
+if [ -z "$1" ]
+then
+    echo "usage: $0 <output-dir>" && exit 1
+fi
+
 tp=1
 model_name=llama2
 model_path=/workspace/models-140/llama2/huggingface/llama-2-7b-chat/
@@ -16,12 +21,12 @@ apt-get install crudini
 crudini --set ${config_path} llama max_context_token_num 4
 crudini --set ${config_path} llama cache_chunk_size -1
 crudini --set ${config_path} llama cache_max_entry_count 1000
-crudini --set ${config_path} llama max_batch_size 256
+crudini --set ${config_path} llama max_batch_size 128
 # end of update config
 
 benchmark_rpm () {
     output_path=$1
-    mkdir -p ${output_path}
+    mkdir -p "${output_path}"
 
     batches=(64 128)
     for batch in "${batches[@]}"
@@ -40,6 +45,8 @@ benchmark_rpm () {
 
 benchmark_generation () {
     output_path=$1
+    mkdir -p "${output_path}"
+
     python3 benchmark/profile_generation.py \
         ${turbomind_model_path} \
         --concurrency 1 16 32 64 \
diff --git a/benchmark/benchmark_13b.sh b/benchmark/benchmark_13b.sh
index 983da3762a..855e2b1fe5 100755
--- a/benchmark/benchmark_13b.sh
+++ b/benchmark/benchmark_13b.sh
@@ -16,12 +16,12 @@ apt-get install crudini
 crudini --set ${config_path} llama max_context_token_num 4
 crudini --set ${config_path} llama cache_chunk_size -1
 crudini --set ${config_path} llama cache_max_entry_count 500
-crudini --set ${config_path} llama max_batch_size 256
+crudini --set ${config_path} llama max_batch_size 128
 # end of update config
 
 benchmark_rpm () {
     output_path=$1
-    mkdir -p ${output_path}
+    mkdir -p "${output_path}"
 
     batches=(64 128)
     for batch in "${batches[@]}"
@@ -40,6 +40,8 @@ benchmark_rpm () {
 
 benchmark_generation () {
     output_path=$1
+    mkdir -p "${output_path}"
+
     python3 benchmark/profile_generation.py \
         ${turbomind_model_path} \
         --concurrency 1 16 32 64 \
diff --git a/benchmark/benchmark_20b.sh b/benchmark/benchmark_20b.sh
index 1e5346c060..d6fe00d32a 100755
--- a/benchmark/benchmark_20b.sh
+++ b/benchmark/benchmark_20b.sh
@@ -16,12 +16,12 @@ apt-get install crudini
 crudini --set ${config_path} llama max_context_token_num 4
 crudini --set ${config_path} llama cache_chunk_size -1
 crudini --set ${config_path} llama cache_max_entry_count 700
-crudini --set ${config_path} llama max_batch_size 256
+crudini --set ${config_path} llama max_batch_size 128
 # end of update config
 
 benchmark_rpm () {
     output_path=$1
-    mkdir -p ${output_path}
+    mkdir -p "${output_path}"
 
     batches=(64 128)
     for batch in "${batches[@]}"
@@ -40,6 +40,8 @@ benchmark_rpm () {
 
 benchmark_generation () {
     output_path=$1
+    mkdir -p "${output_path}"
+
     python3 benchmark/profile_generation.py \
         ${turbomind_model_path} \
         --concurrency 1 16 32 64 \
diff --git a/benchmark/benchmark_70b.sh b/benchmark/benchmark_70b.sh
index e17a4b78bc..d43bd37c55 100755
--- a/benchmark/benchmark_70b.sh
+++ b/benchmark/benchmark_70b.sh
@@ -21,7 +21,7 @@ crudini --set ${config_path} llama max_batch_size 256
 
 benchmark_rpm () {
     output_path=$1
-    mkdir -p ${output_path}
+    mkdir -p "${output_path}"
 
     batches=(64 128 256)
     for batch in "${batches[@]}"
@@ -40,6 +40,8 @@ benchmark_rpm () {
 
 benchmark_generation () {
     output_path=$1
+    mkdir -p "${output_path}"
+
     python3 benchmark/profile_generation.py \
         ${turbomind_model_path} \
         --concurrency 1 64 128 256 \
diff --git a/docs/en/benchmark/a100_fp16.md b/docs/en/benchmark/a100_fp16.md
new file mode 100644
index 0000000000..4ca463b65f
--- /dev/null
+++ b/docs/en/benchmark/a100_fp16.md
@@ -0,0 +1,130 @@
# Benchmark on A100 (FP16)

All the following results were measured on (x8) A100-80G GPUs with CUDA 11.8.

The tested lmdeploy version is `v0.1.0a1`.

The commands below benchmark both [static inference performance](#static-inference-benchmark) and [request throughput](#request-throughput-benchmark) on an A100-80G(x8) node for models of various sizes.

```shell
bash benchmark/benchmark-7b.sh
bash benchmark/benchmark_13b.sh
bash benchmark/benchmark_20b.sh
bash benchmark/benchmark_70b.sh
```
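Each script rewrites the turbomind config with `crudini` before profiling. To spot-check a value after a run, or to profile a single concurrency level without repeating the full sweep, the underlying tools can also be invoked by hand. Below is a minimal sketch, assuming the `config_path` and `turbomind_model_path` variables defined in the scripts above; only options that actually appear in the scripts are shown, and `profile_generation.py` takes further arguments (elided in the patch) that are assumed to fall back to defaults:

```shell
# read back one of the keys the benchmark script set with `crudini --set`
crudini --get ${config_path} llama max_batch_size

# profile static inference at a single concurrency level
python3 benchmark/profile_generation.py \
    ${turbomind_model_path} \
    --concurrency 32
```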
## Static Inference Benchmark

### llama2-7b

| batch | tp | prompt_tokens | completion_tokens | 1st_token_latency(min)(s) | 1st_token_latency(max)(s) | 1st_token_latency(ave)(s) | percentile50(s) | percentile75(s) | percentile95(s) | percentile99(s) | throughput(token/s) | mem_per_gpu(GB) |
| ----- | --- | ------------- | ----------------- | ------------------------- | ------------------------- | ------------------------- | --------------- | --------------- | --------------- | --------------- | ------------------- | --------------- |
| 1 | 1 | 1 | 128 | 0.01 | 0.011 | 0.011 | 0.009 | 0.009 | 0.01 | 0.011 | 100.02 | 76.55 |
| 1 | 1 | 128 | 128 | 0.022 | 0.022 | 0.022 | 0.01 | 0.01 | 0.01 | 0.01 | 102.21 | 76.59 |
| 1 | 1 | 128 | 2048 | 0.022 | 0.022 | 0.022 | 0.01 | 0.01 | 0.01 | 0.01 | 98.92 | 76.59 |
| 1 | 1 | 2048 | 128 | 0.139 | 0.14 | 0.139 | 0.01 | 0.01 | 0.01 | 0.011 | 86.1 | 76.77 |
| 1 | 1 | 2048 | 2048 | 0.139 | 0.141 | 0.14 | 0.011 | 0.011 | 0.011 | 0.011 | 93.78 | 76.77 |
| 16 | 1 | 1 | 128 | 0.011 | 0.031 | 0.021 | 0.01 | 0.011 | 0.011 | 0.013 | 1504.72 | 76.59 |
| 16 | 1 | 128 | 128 | 0.023 | 0.149 | 0.129 | 0.011 | 0.011 | 0.012 | 0.014 | 1272.47 | 76.77 |
| 16 | 1 | 128 | 2048 | 0.023 | 0.144 | 0.13 | 0.015 | 0.018 | 0.02 | 0.021 | 1010.62 | 76.77 |
| 16 | 1 | 2048 | 128 | 0.143 | 3.576 | 2.897 | 0.02 | 0.021 | 0.022 | 0.025 | 348.87 | 78.3 |
| 16 | 1 | 2048 | 2048 | 0.142 | 3.084 | 2.678 | 0.025 | 0.028 | 0.03 | 0.031 | 601.63 | 78.3 |
| 32 | 1 | 1 | 128 | 0.014 | 0.725 | 0.079 | 0.011 | 0.012 | 0.013 | 0.021 | 2136.73 | 76.62 |
| 32 | 1 | 128 | 128 | 0.022 | 0.359 | 0.214 | 0.012 | 0.013 | 0.014 | 0.035 | 2125.47 | 76.99 |
| 32 | 1 | 128 | 2048 | 0.026 | 0.269 | 0.2 | 0.021 | 0.026 | 0.031 | 0.033 | 1462.12 | 76.99 |
| 32 | 1 | 2048 | 128 | 0.143 | 5.267 | 4.288 | 0.031 | 0.032 | 0.034 | 0.161 | 450.43 | 78.3 |
| 32 | 1 | 2048 | 2048 | 0.19 | 5.429 | 4.118 | 0.04 | 0.045 | 0.05 | 0.053 | 733.34 | 78.34 |
| 64 | 1 | 1 | 128 | 0.013 | 0.21 | 0.042 | 0.012 | 0.018 | 0.028 | 0.041 | 4154.81 | 76.71 |
| 64 | 1 | 128 | 128 | 0.026 | 1.061 | 0.44 | 0.014 | 0.018 | 0.026 | 0.158 | 3024.07 | 77.43 |
| 64 | 1 | 128 | 2048 | 0.027 | 1.231 | 0.535 | 0.03 | 0.041 | 0.048 | 0.053 | 1852.06 | 77.96 |
| 64 | 1 | 2048 | 128 | 0.142 | 16.235 | 6.59 | 0.046 | 0.049 | 0.055 | 0.767 | 493.46 | 78.4 |
| 64 | 1 | 2048 | 2048 | 0.142 | 116.285 | 39.105 | 0.047 | 0.049 | 0.051 | 0.207 | 755.65 | 78.4 |

### llama2-13b

| batch | tp | prompt_tokens | completion_tokens | 1st_token_latency(min)(s) | 1st_token_latency(max)(s) | 1st_token_latency(ave)(s) | percentile50(s) | percentile75(s) | percentile95(s) | percentile99(s) | throughput(token/s) | mem_per_gpu(GB) |
| ----- | --- | ------------- | ----------------- | ------------------------- | ------------------------- | ------------------------- | --------------- | --------------- | --------------- | --------------- | ------------------- | --------------- |
| 1 | 1 | 1 | 128 | 0.018 | 0.019 | 0.018 | 0.017 | 0.017 | 0.017 | 0.017 | 57.49 | 74.84 |
| 
1 | 1 | 128 | 128 | 0.039 | 0.04 | 0.04 | 0.017 | 0.017 | 0.017 | 0.018 | 56.58 | 74.84 | +| 1 | 1 | 128 | 2048 | 0.04 | 0.04 | 0.04 | 0.018 | 0.018 | 0.018 | 0.019 | 55.29 | 74.84 | +| 1 | 1 | 2048 | 128 | 0.242 | 0.243 | 0.242 | 0.019 | 0.019 | 0.019 | 0.019 | 48.99 | 75.09 | +| 1 | 1 | 2048 | 2048 | 0.24 | 0.244 | 0.243 | 0.019 | 0.019 | 0.019 | 0.02 | 52.12 | 75.09 | +| 16 | 1 | 1 | 128 | 0.019 | 0.053 | 0.036 | 0.018 | 0.019 | 0.019 | 0.02 | 869.45 | 74.87 | +| 16 | 1 | 128 | 128 | 0.041 | 0.272 | 0.252 | 0.019 | 0.02 | 0.02 | 0.021 | 757.3 | 75.09 | +| 16 | 1 | 128 | 2048 | 0.041 | 0.275 | 0.253 | 0.026 | 0.03 | 0.033 | 0.034 | 605.88 | 75.09 | +| 16 | 1 | 2048 | 128 | 0.245 | 3.668 | 3.442 | 0.033 | 0.034 | 0.035 | 0.035 | 257.92 | 76.96 | +| 16 | 1 | 2048 | 2048 | 0.249 | 3.671 | 3.122 | 0.04 | 0.044 | 0.047 | 0.047 | 366.67 | 76.99 | +| 32 | 1 | 1 | 128 | 0.021 | 0.057 | 0.034 | 0.019 | 0.02 | 0.021 | 0.023 | 1667.5 | 74.9 | +| 32 | 1 | 128 | 128 | 0.04 | 0.497 | 0.461 | 0.021 | 0.022 | 0.023 | 0.025 | 1301.27 | 75.37 | +| 32 | 1 | 128 | 2048 | 0.041 | 1.151 | 0.833 | 0.034 | 0.042 | 0.047 | 0.048 | 860.14 | 75.84 | +| 32 | 1 | 2048 | 128 | 0.245 | 13.483 | 5.315 | 0.046 | 0.047 | 0.049 | 0.51 | 291.54 | 77.02 | +| 32 | 1 | 2048 | 2048 | 0.245 | 108.104 | 38.725 | 0.047 | 0.047 | 0.049 | 0.05 | 389.64 | 77.02 | +| 64 | 1 | 1 | 128 | 0.025 | 0.073 | 0.044 | 0.02 | 0.022 | 0.026 | 0.029 | 3049.16 | 74.96 | +| 64 | 1 | 128 | 128 | 0.046 | 0.951 | 0.703 | 0.024 | 0.026 | 0.029 | 0.032 | 2033.22 | 75.87 | +| 64 | 1 | 128 | 2048 | 0.042 | 60.1 | 7.805 | 0.045 | 0.047 | 0.05 | 0.063 | 998.86 | 76.9 | +| 64 | 1 | 2048 | 128 | 0.245 | 32.394 | 19.69 | 0.047 | 0.048 | 0.05 | 0.27 | 286.32 | 76.99 | +| 64 | 1 | 2048 | 2048 | 0.245 | 307.331 | 190.453 | 0.047 | 0.048 | 0.049 | 0.05 | 387.86 | 77.09 | + +### internlm-20b + +| batch | tp | prompt_tokens | completion_tokens | 1st_token_latency(min)(s) | 1st_token_latency(max)(s) | 1st_token_latency(ave)(s) | percentile50(s) | percentile75(s) | percentile95(s) | percentile99(s) | throughput(token/s) | mem_per_gpu(GB) | +| ----- | --- | ------------- | ----------------- | ------------------------- | ------------------------- | ------------------------- | --------------- | --------------- | --------------- | --------------- | ------------------- | --------------- | +| 1 | 2 | 1 | 128 | 0.017 | 0.019 | 0.018 | 0.016 | 0.016 | 0.016 | 0.018 | 61.14 | 73.55 | +| 1 | 2 | 128 | 128 | 0.041 | 0.043 | 0.042 | 0.016 | 0.016 | 0.016 | 0.017 | 60.03 | 73.55 | +| 1 | 2 | 128 | 2048 | 0.042 | 0.043 | 0.042 | 0.017 | 0.017 | 0.018 | 0.018 | 58.26 | 73.55 | +| 1 | 2 | 2048 | 128 | 0.216 | 0.217 | 0.217 | 0.018 | 0.018 | 0.018 | 0.018 | 51.93 | 73.68 | +| 1 | 2 | 2048 | 2048 | 0.217 | 0.217 | 0.217 | 0.018 | 0.018 | 0.018 | 0.018 | 56.36 | 73.68 | +| 16 | 2 | 1 | 128 | 0.018 | 0.051 | 0.034 | 0.017 | 0.018 | 0.019 | 0.02 | 903.01 | 73.65 | +| 16 | 2 | 128 | 128 | 0.043 | 0.248 | 0.227 | 0.018 | 0.019 | 0.02 | 0.021 | 794.13 | 73.74 | +| 16 | 2 | 128 | 2048 | 0.043 | 0.25 | 0.227 | 0.024 | 0.027 | 0.029 | 0.03 | 669.87 | 73.74 | +| 16 | 2 | 2048 | 128 | 0.247 | 4.485 | 3.09 | 0.029 | 0.03 | 0.031 | 0.032 | 288.60 | 75.60 | +| 16 | 2 | 2048 | 2048 | 0.219 | 4.442 | 3.172 | 0.035 | 0.037 | 0.04 | 0.041 | 441.46 | 75.61 | +| 32 | 2 | 1 | 128 | 0.02 | 0.066 | 0.037 | 0.019 | 0.02 | 0.021 | 0.023 | 1673.64 | 73.71 | +| 32 | 2 | 128 | 128 | 0.043 | 0.436 | 0.351 | 0.02 | 0.021 | 0.023 | 0.025 | 1347.57 | 73.90 | +| 32 | 2 | 128 | 2048 | 0.042 | 0.441 | 0.391 | 0.031 | 
0.037 | 0.041 | 0.043 | 1025.62 | 73.90 |
| 32 | 2 | 2048 | 128 | 0.218 | 6.3 | 6.062 | 0.042 | 0.043 | 0.045 | 0.046 | 352.45 | 75.74 |
| 32 | 2 | 2048 | 2048 | 0.222 | 70.328 | 10.36 | 0.049 | 0.05 | 0.051 | 0.053 | 514.60 | 75.77 |
| 64 | 2 | 1 | 128 | 0.029 | 0.074 | 0.05 | 0.021 | 0.023 | 0.026 | 0.03 | 2954.34 | 73.82 |
| 64 | 2 | 128 | 128 | 0.047 | 0.808 | 0.591 | 0.024 | 0.026 | 0.029 | 0.032 | 2122.92 | 74.24 |
| 64 | 2 | 128 | 2048 | 0.049 | 41.212 | 2.529 | 0.042 | 0.048 | 0.052 | 0.055 | 1276.61 | 75.18 |
| 64 | 2 | 2048 | 128 | 0.219 | 20.986 | 12.382 | 0.05 | 0.051 | 0.054 | 0.249 | 350.82 | 75.88 |
| 64 | 2 | 2048 | 2048 | 0.221 | 211.531 | 111.149 | 0.05 | 0.051 | 0.052 | 0.055 | 512.37 | 76.26 |

### llama2-70b

| batch | tp | prompt_tokens | completion_tokens | 1st_token_latency(min)(s) | 1st_token_latency(max)(s) | 1st_token_latency(ave)(s) | percentile50(s) | percentile75(s) | percentile95(s) | percentile99(s) | throughput(token/s) | mem_per_gpu(GB) |
| ----- | --- | ------------- | ----------------- | ------------------------- | ------------------------- | ------------------------- | --------------- | --------------- | --------------- | --------------- | ------------------- | --------------- |
| 1 | 4 | 1 | 128 | 0.03 | 0.031 | 0.031 | 0.029 | 0.029 | 0.029 | 0.03 | 33.94 | 73.72 |
| 1 | 4 | 128 | 128 | 0.073 | 0.074 | 0.074 | 0.029 | 0.029 | 0.029 | 0.03 | 33.63 | 73.72 |
| 1 | 4 | 128 | 2048 | 0.074 | 0.075 | 0.074 | 0.031 | 0.031 | 0.031 | 0.031 | 32.38 | 73.72 |
| 1 | 4 | 2048 | 128 | 0.401 | 0.403 | 0.402 | 0.031 | 0.031 | 0.031 | 0.051 | 28.32 | 73.78 |
| 1 | 4 | 2048 | 2048 | 0.402 | 0.407 | 0.405 | 0.031 | 0.031 | 0.031 | 0.031 | 31.9 | 73.78 |
| 16 | 4 | 1 | 128 | 0.034 | 0.939 | 0.071 | 0.03 | 0.031 | 0.032 | 0.251 | 468.52 | 73.72 |
| 16 | 4 | 128 | 128 | 0.08 | 0.687 | 0.437 | 0.03 | 0.031 | 0.032 | 0.207 | 439.77 | 73.81 |
| 16 | 4 | 128 | 2048 | 0.079 | 0.44 | 0.403 | 0.033 | 0.033 | 0.035 | 0.036 | 482.99 | 73.81 |
| 16 | 4 | 2048 | 128 | 0.437 | 7.612 | 5.776 | 0.035 | 0.036 | 0.036 | 0.037 | 189.34 | 73.98 |
| 16 | 4 | 2048 | 2048 | 0.411 | 6.844 | 5.773 | 0.036 | 0.037 | 0.038 | 0.041 | 399.42 | 73.98 |
| 32 | 4 | 1 | 128 | 0.043 | 0.253 | 0.098 | 0.032 | 0.033 | 0.035 | 0.178 | 906.03 | 73.75 |
| 32 | 4 | 128 | 128 | 0.078 | 1.026 | 0.749 | 0.032 | 0.033 | 0.035 | 0.438 | 746.36 | 73.91 |
| 32 | 4 | 128 | 2048 | 0.076 | 1.129 | 0.732 | 0.036 | 0.038 | 0.041 | 0.158 | 853.56 | 73.91 |
| 32 | 4 | 2048 | 128 | 0.408 | 13.321 | 11.834 | 0.04 | 0.041 | 0.043 | 0.248 | 232.6 | 73.99 |
| 32 | 4 | 2048 | 2048 | 0.409 | 12.689 | 11.711 | 0.043 | 0.045 | 0.048 | 0.179 | 636.23 | 73.99 |
| 64 | 4 | 1 | 128 | 0.046 | 1.264 | 0.213 | 0.037 | 0.039 | 0.044 | 0.329 | 1425.79 | 73.81 |
| 64 | 4 | 128 | 128 | 0.107 | 2.676 | 1.292 | 0.037 | 0.04 | 0.045 | 0.378 | 1159.84 | 73.96 |
| 64 | 4 | 128 | 2048 | 0.135 | 1.623 | 1.173 | 0.043 | 0.047 | 0.052 | 0.251 | 1391.8 | 73.95 |
| 64 | 4 | 2048 | 128 | 0.452 | 24.164 | 17.402 | 0.05 | 0.052 | 0.057 | 0.345 | 270.47 | 74.02 |
| 64 | 4 | 2048 | 2048 | 0.423 | 24.498 | 21.29 | 0.055 | 0.059 | 0.065 | 0.299 | 930.46 | 74.01 |

## Request Throughput Benchmark

| model_name | batch | tp | num_prompts | 1st_token_latency(min)(s) | 1st_token_latency(max)(s) | 1st_token_latency(ave)(s) | output_token thr(tokens/s) | total_token thr(tokens/s) | RPS(req/s) | RPM(req/min) |
| ------------ | ----- | --- | ----------- | ------------------------- | ------------------------- | ------------------------- | 
-------------------------- | ------------------------- | ------ | ------- | +| llama2-7b | 64 | 1 | 3000 | 0.036 | 1.145 | 0.092 | 2562.435 | 5283.547 | 10.275 | 616.477 | +| | 128 | 1 | 3000 | 0.056 | 2.241 | 0.205 | 3210.281 | 6619.357 | 12.611 | 756.677 | +| llama2-13b | 64 | 1 | 3000 | 0.051 | 2.048 | 0.159 | 1474.786 | 3039.398 | 6.337 | 380.244 | +| | 128 | 1 | 3000 | 0.085 | 4.445 | 0.412 | 1765.788 | 3639.128 | 7.588 | 455.273 | +| internlm-20b | 64 | 2 | 3000 | 0.059 | 2.461 | 0.166 | 1564.696 | 3311.16 | 7.842 | 470.516 | +| | 128 | 2 | 3000 | 0.079 | 5.808 | 0.34 | 1950.627 | 4127.855 | 9.776 | 586.568 | +| llama2-70b | 64 | 4 | 3000 | 0.083 | 4.689 | 0.301 | 1000.376 | 2062.7 | 4.285 | 257.08 | +| | 128 | 4 | 3000 | 0.107 | 8.431 | 0.633 | 1361.939 | 2808.216 | 5.833 | 349.996 | +| | 256 | 4 | 3000 | 0.171 | 19.52 | 1.49 | 1533.592 | 3162.15 | 6.568 | 394.108 |
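RPM in the table above is simply RPS expressed per minute. For example, llama2-7b at batch 64 serves 10.275 req/s, and 10.275 × 60 ≈ 616.5 req/min, which matches the reported RPM of 616.477.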