forked from goliaro/specinfer-ae
-
Notifications
You must be signed in to change notification settings - Fork 0
/
offloading_experiments.sh
executable file
·40 lines (31 loc) · 1.66 KB
/
offloading_experiments.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#! /usr/bin/env bash
# Abort on the first failing command and trace every command as it runs.
set -e
set -x

# Cd into the directory holding this script.
# `dirname` is used rather than the `${BASH_SOURCE[0]%/*}` suffix strip: when
# the script is invoked through a path without a slash (for example
# `bash offloading_experiments.sh`) the strip leaves the file name untouched,
# so `cd` would target the script file itself and the run would abort.
cd -- "$(dirname -- "${BASH_SOURCE[0]}")"

# Put the UCX 1.15.0 install tree that sits next to this script at the front
# of the executable and shared-library search paths.
export UCX_DIR="$PWD/ucx-1.15.0/install"
export PATH="$UCX_DIR/bin:$PATH"
# Only append the previous LD_LIBRARY_PATH when it is non-empty; a dangling
# ':' would add an empty entry, which the dynamic loader interprets as the
# current working directory.
export LD_LIBRARY_PATH="$UCX_DIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Run the sibling helper scripts (by their names, they download the dataset
# and the models) before any experiment starts.
./download_dataset.sh
./download_models.sh

# Values passed to --max-requests-per-batch, one spec_infer run per value.
batch_sizes=(1 2 4 8 16)

# Start from a clean slate: create the output directory if needed and drop
# the results of any previous offloading run.
output_dir=./FlexFlow/inference/output
mkdir -p "$output_dir"
rm -rf "$output_dir"/offloading_* || true

start_time=$(date +%s)

# single node, single GPU
ncpus=8
ngpus=1
ssm_model_name="facebook/opt-125m"

#######################################
# Run spec_infer with offloading enabled once per entry of batch_sizes.
# Globals:   batch_sizes, ncpus, ngpus, ssm_model_name, output_dir (read)
# Arguments: $1 - model name passed to -llm-model
#            $2 - value passed to -offload-reserve-space-size
#            $3 - tag embedded in the result file names ("small" / "large")
# Outputs:   for every batch size, <output_dir>/offloading_<tag>_<bs>.txt
#            (via -output-file) and <output_dir>/offloading_<tag>_<bs>.out
#            (captured stdout)
# Returns:   non-zero as soon as a spec_infer invocation fails
#######################################
run_offloading_sweep() {
  local llm=$1
  local reserve_space=$2
  local tag=$3
  local bs

  for bs in "${batch_sizes[@]}"; do
    ./FlexFlow/build/inference/spec_infer/spec_infer \
      -ll:cpu "$ncpus" -ll:util 8 -ll:gpu "$ngpus" \
      -ll:fsize 21000 -ll:zsize 80000 \
      -llm-model "$llm" -ssm-model "$ssm_model_name" \
      -prompt ./FlexFlow/inference/prompt/chatgpt_offloading.json \
      --max-requests-per-batch "$bs" --expansion-degree -1 \
      -offload -offload-reserve-space-size "$reserve_space" \
      --max-sequence-length 256 \
      -output-file "$output_dir/offloading_${tag}_${bs}.txt" \
      > "$output_dir/offloading_${tag}_${bs}.out" || return
  done
}

# Smaller LLM, 500 reserved for offloading.
llm_model_name="facebook/opt-13b"
run_offloading_sweep "$llm_model_name" 500 small

# Larger LLM, 700 reserved for offloading.
llm_model_name="facebook/opt-30b"
run_offloading_sweep "$llm_model_name" 700 large

end_time=$(date +%s)
execution_time=$((end_time - start_time))
echo "Total offload test time: $execution_time seconds"