Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Text gen inference #22

Draft
wants to merge 16 commits into
base: custom_generate
Choose a base branch
from
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
!src
!scripts
!transformers
!text-generation-inference
!requirements.txt
!Makefile
20 changes: 19 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,35 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \
&& chown $USERNAME /app

# git-lfs is needed to interact with the huggingface hub
# ssl and gcc are needed for text-gen-inference
RUN apt-get update \
&& apt-get install git-lfs \
&& apt-get install git-lfs libssl-dev gcc \
&& rm -rf /var/lib/apt/lists/* \
&& git lfs install


RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
&& PROTOC_ZIP=protoc-21.12-linux-x86_64.zip \
&& curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP \
&& unzip -o $PROTOC_ZIP -d /usr/local bin/protoc \
&& unzip -o $PROTOC_ZIP -d /usr/local 'include/*' \
&& rm -f $PROTOC_ZIP \
&& chmod 777 /root/ && chmod 777 /root/.cargo

ENV PATH="/root/.cargo/bin:$PATH"

COPY --chown=$USERNAME text-generation-inference/ ./text-generation-inference

RUN cd text-generation-inference && make install && make install-benchmark && cd ..

COPY --chown=$USERNAME ./requirements.txt ./
COPY --chown=$USERNAME transformers/ ./transformers

# Stock version of pip doesn't work with editable transformers.
RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir

ENV HUGGINGFACE_HUB_CACHE=/app/data/.hf_cache/

COPY --chown=$USERNAME Makefile .
COPY --chown=$USERNAME src/ ./src
COPY --chown=$USERNAME scripts/ ./scripts
16 changes: 8 additions & 8 deletions scripts/run_all_benchmark_breakdown.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1 v2_

# Large model
./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 11 0 v2_
./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 11 0 v2_
./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 32 8190 11 0 v2_
./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 11 0 v2_# OOM?
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 11 0 v2_
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 11 0 v2_
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 11 0 v2_
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 11 0 v2_ # OOM?

./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 1 8190 29 1 v2_ 1
./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 8 8190 29 1 v2_ 1
./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 32 8190 29 1 v2_ 1
./scripts/run_benchmark_breakdown.sh large_model ./data/large-model 256 8190 29 1 v2_ 1 # OOM?
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 29 1 v2_ 1
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 29 1 v2_ 1
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 29 1 v2_ 1
./scripts/run_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 29 1 v2_ 1 # OOM?
20 changes: 20 additions & 0 deletions scripts/run_all_textgen_benchmark_breakdown.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

# Sweep the text-generation-inference benchmark breakdown over models,
# batch sizes and steps. Argument order of run_textgen_benchmark_breakdown.sh:
#   <model_name> <model_path> <batch_size> <max_new_tokens> <token_step> <step_id> [cycles]
# step_id (0 or 1) indexes STEP_NAME=("decode" "prefill") in that script;
# the trailing "1" in the last group of runs is the optional CYCLES argument.

# Santacoder
./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 5 0
./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 0
./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 5 0

./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 1 2040 11 1
./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1
./scripts/run_textgen_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 256 2040 11 1

# Large model (starcoder), max sequence length 8190.
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 11 0
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 11 0
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 11 0
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 11 0 # OOM?

# Same model, coarser token step (29) and a single cycle.
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 1 8190 29 1 1
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 8 8190 29 1 1
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 32 8190 29 1 1
./scripts/run_textgen_benchmark_breakdown.sh starcoder ./data/bigcode_large-model 256 8190 29 1 1 # OOM?
4 changes: 3 additions & 1 deletion scripts/run_benchmark_breakdown.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ run () { # run(step, runtime, attn)
then
echo "Skipping existing $FILE_NAME"
else
$RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save="$FILE_NAME"
CMD="$RUN $COMMON ${RUNTIME[$2]} ${ATTN[$3]} ${STEP[$1]} --save=$FILE_NAME"
echo "$CMD"
$CMD
fi
}

Expand Down
50 changes: 50 additions & 0 deletions scripts/run_textgen_benchmark_breakdown.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

# Santacoder prefill.
# ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 5 0
# Santacoder decode (fewer data points because slower)
# ./scripts/run_benchmark_breakdown.sh santacoder bigcode/gpt_bigcode-santacoder 32 2040 11 1
MODEL_NAME=${1:-"santacoder"}
MODEL_PATH=${2:-"bigcode/gpt_bigcode-santacoder"}
BATCH_SIZE=${3:-32}
MAX_NEW_TOKENS=${4:-2040}
# Prime number to see key length padding effect.
TOKEN_STEP=${5:-5}
STEP_ID=${6:-""}
CYCLES=${7:-10}

SAVE_DIR=data/benchmarks/v5
RUN="python3 -m src.main --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init "


IMPL=("flash" "causal" "vector" "bigcode" "bigcode2" "bigcode3")


STEP=("" "--no_cache")
STEP_NAME=("decode" "prefill")

COMMON="--pretrained_model=$MODEL_PATH --tokenizer=$MODEL_PATH --cycles=$CYCLES --max_input_length=1 --max_new_tokens=$MAX_NEW_TOKENS --key_length_step=$TOKEN_STEP --batch_size=$BATCH_SIZE"

run () { # run(step_idx, impl_idx): launch one benchmark unless its output already exists.
# The output file encodes every sweep dimension, so interrupted sweeps resume
# by skipping files that are already present.
FILE_NAME="$SAVE_DIR"/"$MODEL_NAME"_bs_"$BATCH_SIZE"_tok_"$MAX_NEW_TOKENS"_"${STEP_NAME[$1]}"_step_"$TOKEN_STEP"_"$CYCLES"/"${IMPL[$2]}".json
if [ -f "$FILE_NAME" ];
then
echo "Skipping existing $FILE_NAME"
else
# The attention implementation is selected via the MODEL_TYPE env var
# (presumably read by the TG_Pipeline backend — TODO confirm the consumer).
export MODEL_TYPE="${IMPL[$2]}"
CMD="$RUN $COMMON ${STEP[$1]} --save=$FILE_NAME"
echo "MODEL_TYPE=${IMPL[$2]} $CMD"
# NOTE(review): unquoted $CMD relies on word splitting; fine as long as no
# path or flag value contains whitespace.
$CMD
fi
}

# Sweep every attention implementation (indices into IMPL) for the requested
# step. ${STEP_ID:-0} guards the integer comparison: STEP_ID defaults to the
# empty string, and `[ "" -eq "0" ]` is an error in bash — the failing test
# would silently fall through to the prefill branch. An empty/unset STEP_ID
# now explicitly selects the decode (step 0) sweep.
for impl in {0..5}
do
    if [ "${STEP_ID:-0}" -eq 0 ]
    then
        # Decode (default attn only)
        run 0 "$impl"
    else
        # Prefill
        run 1 "$impl"
    fi
done
12 changes: 8 additions & 4 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def get_arg_parser() -> ArgumentParser:
parser.add_argument("--max_log_outputs", type=int)
parser.add_argument("--breakdown_latency", "--bl", action="store_true")
parser.add_argument("--profile", "-p", action="store_true")
parser.add_argument("--profile_cpu", "--pcpu", action="store_true")
parser.add_argument("--profile_cycles", "--pc", type=int)
parser.add_argument("--full_trace", "--pt", action="store_true")
parser.add_argument("--show_op_names", "--pn", action="store_true")
Expand Down Expand Up @@ -108,13 +109,16 @@ def main(argv: Optional[List[str]] = None) -> None:

all_metrics = []

if args.profile:
profile = args.profile or args.profile_cpu

if profile:
profiler = get_profiler(
skip=args.skip + pre_warmup_cycles,
warmup=warmup,
cycles=post_warmup_cycles,
full_trace=args.full_trace,
show_op_names=args.show_op_names,
cpu=args.profile_cpu,
)
else:
profiler = contextlib.nullcontext()
Expand All @@ -125,7 +129,7 @@ def main(argv: Optional[List[str]] = None) -> None:
"Cycles (warmup)": args.skip + warmup,
"Cycles (benchmark)": args.cycles,
}
if args.profile:
if profile:
benchmark_metrics["Cycles (profile)"] = post_warmup_cycles
benchmark_metrics["Cycles (total)"] = args.skip + warmup + pre_warmup_cycles + post_warmup_cycles

Expand Down Expand Up @@ -158,7 +162,7 @@ def main(argv: Optional[List[str]] = None) -> None:
ignore_oom=args.ignore_oom,
pad_generated_tokens=args.pad_generated_tokens,
)
if args.profile:
if profile:
p.step()

if step == 0:
Expand All @@ -179,10 +183,10 @@ def main(argv: Optional[List[str]] = None) -> None:
benchmark_metrics[Metrics.MEMORY_RESERVED_MAX] = torch.cuda.max_memory_reserved()

t3 = time.perf_counter()
benchmark_metrics[Metrics.RUNTIME_BENCHMARK] = t3 - t2
benchmark_metrics[Metrics.RUNTIME_TOTAL] = t3 - t0

if len(all_metrics) > 0:
benchmark_metrics[Metrics.RUNTIME_BENCHMARK] = t3 - t2
benchmark_metrics.update(pipeline.aggregate_metrics(all_metrics))

benchmark_metrics = Metrics.reorder_metrics(benchmark_metrics)
Expand Down
20 changes: 11 additions & 9 deletions src/parse_breakdown_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from argparse import ArgumentParser
from pathlib import Path
from typing import List, Optional
import matplotlib.pyplot as plt
import pandas as pd


def get_arg_parser() -> ArgumentParser:
Expand All @@ -10,6 +12,7 @@ def get_arg_parser() -> ArgumentParser:
parser.add_argument("--title")
parser.add_argument("--size", nargs=2, type=float)
parser.add_argument("--save_dir", "--save", type=Path)
parser.add_argument("--rolling", "-r", type=int)
return parser


Expand All @@ -24,20 +27,19 @@ def read_data(input_file: Path):
return data


def plot(data, title=None, size=None):
import matplotlib.pyplot as plt

def plot(data, title=None, size=None, rolling=None):
fig = plt.figure(figsize=size)
ax = fig.add_subplot()

cmap = plt.get_cmap("tab20").colors
cmap = cmap[::2] + cmap[1::2]

for i, dat in enumerate(data):
latency_data = dat["Latency (generate breakdown)"]
latency_data = pd.Series({int(k): v * 1000 for k, v in dat["Latency (generate breakdown)"].items()})
if rolling is not None:
latency_data = latency_data.rolling(rolling, center=True, min_periods=1).mean()
ax.plot(
[int(k) for k in latency_data.keys()],
[v * 1000 for v in latency_data.values()],
latency_data,
label=dat["Setting"],
linewidth=1,
color=cmap[i],
Expand All @@ -62,12 +64,12 @@ def main(argv: Optional[List[str]] = None) -> None:
dirname = args.input_dir.stem
if title is None:
try:
name, _, bs, _, _, _, _, step = dirname.rsplit("_", 7)
title = f"{name} {step}, bs = {bs}"
name, _, bs, _, _, _, _, step, cycles = dirname.rsplit("_", 8)
title = f"{name}, bs = {bs} (s={step}, c={cycles})"
except ValueError:
title = dirname

fig = plot(data, title, args.size)
fig = plot(data, title, args.size, args.rolling)
fig.show()
if args.save_dir:
save_path = (args.save_dir / dirname).with_suffix(".jpg")
Expand Down
Loading