Commit

Removed obsolete parts. Made ROCm attention defaults define-guarded.
gshtras committed Mar 19, 2024
1 parent 1510843 commit 3d82eea
Showing 7 changed files with 92 additions and 618 deletions.
62 changes: 29 additions & 33 deletions Dockerfile.rocm
@@ -1,9 +1,14 @@
FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1
ENV WORKSPACE_DIR=/workspace
RUN mkdir -p $WORKSPACE_DIR
WORKDIR $WORKSPACE_DIR
# Limit arch's so composable kernel doesn't take days to finish
ENV PYTORCH_ROCM_ARCH=gfx90a;gfx942
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"


ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"

@@ -22,27 +27,9 @@ ARG BUILD_CUPY="1"
# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
wget \
unzip \
nvidia-cuda-toolkit \
tmux \
sqlite3 libsqlite3-dev libfmt-dev \
&& rm -rf /var/lib/apt/lists/*

### Mount Point ###
@@ -60,6 +52,8 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& export GPU_ARCHS=${FA_GFX_ARCHS} \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
@@ -94,27 +88,29 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/ROCmSoftwarePlatform/triton.git
&& git clone https://github.com/ROCm/triton.git \
&& cd triton/python \
&& pip3 install -e . \
&& cd ../..; \
&& cd ../.. \
&& rm -r triton; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps

RUN cd vllm \
&& pip install -r requirements-rocm.txt \
&& pip install typing-extensions==4.8.0 \
&& bash patch_xformers.rocm.sh \
&& cd gradlib && python setup.py develop && cd ../ \
&& python setup.py build && python setup.py develop; exit 0

RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3
RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
&& if [ "$BUILD_FA" = "1" ]; then \
bash patch_xformers.rocm.sh; fi \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \
&& python3 setup.py install \
&& cd ..

RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \
&& cd rocmProfileData && make; make install
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]

WORKDIR /workspace/vllm
CMD ["/bin/bash"]
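
The reworked Dockerfile.rocm now takes its base image and optional components as build arguments (BASE_IMAGE, FA_GFX_ARCHS, BUILD_FA, BUILD_CUPY, BUILD_TRITON). A minimal build sketch under those assumptions follows; the tag name "vllm-rocm" and the choice of the ROCm 5.7 base image are illustrative, not mandated by this commit.

    # Sketch: build from the repository root, overriding the build args defined above.
    # "vllm-rocm" is an arbitrary tag; either documented BASE_IMAGE value can be used.
    docker build -f Dockerfile.rocm \
        --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
        --build-arg FA_GFX_ARCHS="gfx90a;gfx942" \
        --build-arg BUILD_FA="1" \
        --build-arg BUILD_TRITON="1" \
        -t vllm-rocm .
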
168 changes: 54 additions & 114 deletions benchmarks/benchmark_latency.py
@@ -3,23 +3,17 @@
import time
from pathlib import Path
from typing import Optional
import pandas as pd

import numpy as np
import torch
from tqdm import tqdm

from vllm import LLM, SamplingParams
from torch.profiler import profile, record_function, ProfilerActivity

def list_of_ints(arg):
return list(map(int, arg.split(',')))

def main(args: argparse.Namespace):
print(args)

print(f'>>>Loading LLM')
if args.report:
results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency'])
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(
@@ -36,101 +30,60 @@ def main(args: argparse.Namespace):
ray_workers_use_nsight=args.ray_workers_use_nsight,
)

for batch_size in args.batch_size:
for output_len in args.output_len:
for input_len in args.input_len:
print(f'>>>RUNNING {args.model} Batch_size:{batch_size} Input_len:{input_len} Output_len:{output_len}')
sampling_params = SamplingParams(
n=args.n,
temperature=0.0 if args.use_beam_search else 1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=output_len,
)
print(sampling_params)
dummy_prompt_token_ids = [[0] * input_len] * batch_size
dummy_prompts = []
dummy_prompts.append('DeepSpeed is a machine learning library that deep learning practitioners should use for what purpose')

def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
elif args.accuracy:
start_time = time.perf_counter()
rsp = llm.generate(
#prompt_token_ids=dummy_prompt_token_ids,
prompts=dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
latency = end_time - start_time
print('>>Rsp', rsp[0].outputs)
return latency
else:
start_time = time.perf_counter()
rsp = llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
latency = end_time - start_time
print('>>Rsp', rsp[0].outputs)
return latency

print("Warming up...")
run_to_completion(profile_dir=None)

if (args.warmup_only):

print(">>> Warmup only specified, exiting")
continue
sampling_params = SamplingParams(
n=args.n,
temperature=0.0 if args.use_beam_search else 1.0,
top_p=1.0,
use_beam_search=args.use_beam_search,
ignore_eos=True,
max_tokens=args.output_len,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()

if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = Path(
"."
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=args.profile_result_dir)
return
if args.rpd:
from rpdTracerControl import rpdTracerControl
rpdTracerControl.setFilename(name = "/workspace/trace.rpd", append=True)
profile_rpd = rpdTracerControl()
profile_rpd.start()
print(f"RPD Profiling'...")
with torch.autograd.profiler.emit_nvtx():
run_to_completion(profile_dir=None)
profile_rpd.stop()
return
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
latency = end_time - start_time
return latency

# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
print("Warming up...")
run_to_completion(profile_dir=None)

if torch.distributed.get_rank() == 0:
#results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency'])
latency=np.mean(latencies)
print(f'Avg latency: {latency} seconds')
if args.report:
entry = {'model':[args.model], 'tp':[args.tensor_parallel_size],'batch':[batch_size], 'input':[input_len], 'output':[output_len], 'latency':[latency]}
results_df = pd.concat([results_df, pd.DataFrame(entry)], ignore_index=True)
if torch.distributed.get_rank() == 0 and args.report:
print(results_df)
results_df.to_csv(args.report_file, index=False)
if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = Path(
"."
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return

# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
print(f'Avg latency: {np.mean(latencies)} seconds')


if __name__ == '__main__':
@@ -144,9 +97,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
choices=['awq', 'gptq', 'squeezellm', None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=list_of_ints, default=32)
parser.add_argument('--output-len', type=list_of_ints, default=128)
parser.add_argument('--batch-size', type=list_of_ints, default=8)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
parser.add_argument('--n',
type=int,
default=1,
@@ -159,7 +112,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')

parser.add_argument(
'--dtype',
type=str,
@@ -172,9 +124,6 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--enforce-eager',
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument('--accuracy',
action='store_true',
help='Run an Actual query through vllm')
parser.add_argument(
"--kv-cache-dtype",
type=str,
@@ -216,14 +165,5 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument(
'--rpd',
action='store_true',
help='profile the generation process of a single batch using the rpd tracer')
parser.add_argument('--warmup-only', action='store_true',
help='only run warmup, useful for tuning')
parser.add_argument('--report', action='store_true',
help='turn on dataframe reporting')
parser.add_argument('--report-file', type=str, default=None)
args = parser.parse_args()
main(args)
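
With the report, accuracy, RPD, and warmup-only paths removed, --input-len, --output-len, and --batch-size take plain integers again. A possible invocation is sketched below; the model name is illustrative, and --model and --num-iters are pre-existing flags inferred from args.model and args.num_iters in the script.

    # Sketch: single-configuration latency run after this change (model name is illustrative).
    python3 benchmarks/benchmark_latency.py \
        --model facebook/opt-125m \
        --input-len 32 \
        --output-len 128 \
        --batch-size 8 \
        --num-iters 3
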
9 changes: 9 additions & 0 deletions csrc/attention/attention_kernels.cu
@@ -629,7 +629,11 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 1024>
#else
int NUM_THREADS = 128>
#endif
void paged_attention_v1_launcher(
torch::Tensor& out,
torch::Tensor& query,
@@ -810,8 +814,13 @@ template<
typename CACHE_T,
int BLOCK_SIZE,
bool IS_FP8_KV_CACHE,
#ifdef USE_ROCM
int NUM_THREADS = 128,
int PARTITION_SIZE = 512>
#else
int NUM_THREADS = 1024,
int PARTITION_SIZE = 1024>
#endif
void paged_attention_v2_launcher(
torch::Tensor& out,
torch::Tensor& exp_sums,
7 changes: 0 additions & 7 deletions csrc/cache.h
@@ -24,13 +24,6 @@ void reshape_and_cache(
const std::string& kv_cache_dtype,
const float kv_scale);

void gather_cached_kv(
torch::Tensor& key,
torch::Tensor& value,
torch::Tensor& key_cache,
torch::Tensor& value_cache,
torch::Tensor& slot_mapping);

// Just for unittest
void convert_fp8(
torch::Tensor& src_cache,