Merge pull request #53 from makaveli10/upgrade/tensorrt_llm_0_10_0

Upgrade to TensorRT-LLM v0.10.0.
collabora · Jun 19, 2024 · 67f82b4 · 67f82b4
2 parents d61b5a9 + 1993d76
commit 67f82b4
Show file tree

Hide file tree

Showing 18 changed files with 140 additions and 215 deletions.
diff --git a/README.md b/README.md
@@ -41,14 +41,13 @@ The demo was run on a single RTX 4090 GPU. WhisperFusion uses the Nvidia TensorR
 ## Getting Started
 We provide a Docker Compose setup to streamline the deployment of the pre-built TensorRT-LLM docker container. This setup includes both Whisper and Phi converted to TensorRT engines, and the WhisperSpeech model is pre-downloaded to quickly start interacting with WhisperFusion. Additionally, we include a simple web server for the Web GUI.
 
-- Build and Run with docker compose for RTX 3090 and RTX
+- Build and Run with docker compose
 ```bash
 mkdir docker/scratch-space
 cp docker/scripts/build-* docker/scripts/run-whisperfusion.sh docker/scratch-space/
 
-# Set the CUDA_ARCH environment variable based on your GPU
-# Use '86-real' for RTX 3090, '89-real' for RTX 4090
-CUDA_ARCH=86-real docker compose build
+docker compose build
+export MODEL=Phi-3-mini-4k-instruct    # or Phi-3-mini-128k-instruct / phi-2; if unset, docker compose defaults to Phi-3-mini-4k-instruct (run-whisperfusion.sh on its own falls back to phi-2)
 docker compose up
 ```
 

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,19 +1,16 @@
-version: '3.8'
-
 services:
   whisperfusion:
     build:
       context: docker
       dockerfile: Dockerfile
-      args:
-        CUDA_ARCH: ${CUDA_ARCH:-89-real;90-real}
     image: whisperfusion:latest
     volumes:
       - type: bind
         source: ./docker/scratch-space
         target: /root/scratch-space
     environment:
       VERBOSE: ${VERBOSE:-false}
+      MODEL: ${MODEL:-Phi-3-mini-4k-instruct}
     ports:
       - "8888:8888"
       - "6006:6006"

diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,27 +1,23 @@
 ARG BASE_IMAGE=nvcr.io/nvidia/cuda
-ARG BASE_TAG=12.2.2-devel-ubuntu22.04
+ARG BASE_TAG=12.4.0-runtime-ubuntu22.04
 
 FROM ${BASE_IMAGE}:${BASE_TAG} as base
 ARG CUDA_ARCH
 ENV CUDA_ARCH=${CUDA_ARCH}
 
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    xz-utils \
-    curl \
-    git && \
+RUN apt-get update && apt-get install -y \
+    python3.10 python3-pip openmpi-bin libopenmpi-dev git wget \
+    xz-utils curl && \
     rm -rf /var/lib/apt/lists/*
 
 FROM base as devel
-WORKDIR /root
-COPY scripts/install-deps.sh /root
-RUN bash install-deps.sh && rm install-deps.sh
-COPY scripts/build-trt-llm.sh /root
-RUN bash build-trt-llm.sh && rm build-trt-llm.sh
+WORKDIR /root/
+RUN pip3 install --no-cache-dir -U tensorrt_llm==0.10.0 --extra-index-url https://pypi.nvidia.com
+RUN git clone -b v0.10.0 --depth 1 https://github.com/NVIDIA/TensorRT-LLM.git && \
+    mv TensorRT-LLM/examples ./TensorRT-LLM-examples && \
+    rm -rf TensorRT-LLM
 
 FROM devel as release
 WORKDIR /root/
-COPY scripts/install-trt-llm.sh /root
-RUN bash install-trt-llm.sh && rm install-trt-llm.sh
 COPY scripts/setup-whisperfusion.sh /root/
 RUN ./setup-whisperfusion.sh
diff --git a/docker/scripts/build-models.sh b/docker/scripts/build-models.sh
@@ -9,9 +9,9 @@ else
     echo "whisper_small_en directory exists and is not empty. Skipping build-whisper.sh..."
 fi
 # ./build-mistral.sh
-if [ ! -d "/root/scratch-space/models/dolphin-2_6-phi-2" ] || [ -z "$(ls -A /root/scratch-space/models/dolphin-2_6-phi-2)" ]; then
-    echo "dolphin-2_6-phi-2 directory does not exist or is empty. Running build-dolphin-2_6-phi-2.sh..."
-    ./build-dolphin-2_6-phi-2.sh
+if [ ! -d "/root/scratch-space/models/$1" ] || [ -z "$(ls -A /root/scratch-space/models/$1)" ]; then
+    echo "$1 directory does not exist or is empty. Running build-phi.sh..."
+    ./build-phi.sh $1
 else
-    echo "dolphin-2_6-phi-2 directory exists and is not empty. Skipping build-dolphin-2_6-phi-2.sh..."
+    echo "$1 directory exists and is not empty. Skipping build-phi.sh..."
 fi
diff --git a/docker/scripts/build-phi-2.sh b/docker/scripts/build-phi-2.sh
diff --git a/docker/scripts/build-phi.sh b/docker/scripts/build-phi.sh
@@ -0,0 +1,55 @@
+#!/bin/bash -e
+
+## Build a TensorRT-LLM engine for a Microsoft Phi model and copy the engine
+## plus its tokenizer files to /root/scratch-space/models/<model>.
+##
+## Usage: build-phi.sh <model>
+##   <model> is the repository name under the `microsoft` Hugging Face
+##   organisation. It is also passed to convert_checkpoint.py as --model_type
+##   and used as the engine directory name.
+
+MODEL_TYPE=${1:?usage: build-phi.sh <model>}
+name=$MODEL_TYPE
+
+cd /root/TensorRT-LLM-examples/phi
+
+echo "Download $MODEL_TYPE Huggingface models..."
+# The command's stdout is captured and used as the local model directory.
+phi_path=$(huggingface-cli download --repo-type model "microsoft/$MODEL_TYPE")
+
+echo "Building  TensorRT Engine..."
+pip install -r requirements.txt
+
+## Convert the Hugging Face checkpoint to a TensorRT-LLM checkpoint in `fp16`
+python3 ./convert_checkpoint.py --model_type "$MODEL_TYPE" \
+                    --model_dir "$phi_path" \
+                    --output_dir ./phi-checkpoint \
+                    --dtype float16
+
+## Build the TensorRT engine from the converted checkpoint
+trtllm-build \
+    --checkpoint_dir ./phi-checkpoint \
+    --output_dir "$name" \
+    --gpt_attention_plugin float16 \
+    --context_fmha enable \
+    --gemm_plugin float16 \
+    --max_batch_size 1 \
+    --max_input_len 1024 \
+    --max_output_len 1024 \
+    --tp_size 1 \
+    --pp_size 1
+
+dest=/root/scratch-space/models
+mkdir -p "$dest/$name/tokenizer"
+cp -r "$name" "$dest"
+
+# Tokenizer/config files that must be present in the downloaded snapshot.
+(cd "$phi_path" && cp config.json tokenizer_config.json tokenizer.json special_tokens_map.json added_tokens.json "$dest/$name/tokenizer")
+# vocab.json / merges.txt are copied only when the snapshot ships them, so a
+# model without these files does not abort the script (it runs with `-e`).
+for f in vocab.json merges.txt; do
+    if [ -f "$phi_path/$f" ]; then
+        cp "$phi_path/$f" "$dest/$name/tokenizer"
+    fi
+done
+cp -r "$phi_path" "$dest/phi-orig-model"
diff --git a/docker/scripts/build-trt-llm.sh b/docker/scripts/build-trt-llm.sh
diff --git a/docker/scripts/build-whisper.sh b/docker/scripts/build-whisper.sh
@@ -3,16 +3,6 @@
 ## Change working dir to the [whisper example dir](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/whisper) in TensorRT-LLM.
 cd /root/TensorRT-LLM-examples/whisper
 
-## Currently, by default TensorRT-LLM only supports `large-v2` and `large-v3`. In this repo, we use `small.en`.
-## Download the required assets
-
-if [ ! -f assets/mel_filters.npz ]; then
-    echo "Downloading mel filter definitions"
-    wget --directory-prefix=assets https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz > /dev/null 2>&1
-else
-    echo "Mel filter definitions already exist, skipping download."
-fi
-
 # the small.en model weights
 if [ ! -f assets/small.en.pt ]; then
     echo "Downloading PyTorch weights for small.en model"
@@ -24,7 +14,8 @@ fi
 echo "Building Whisper TensorRT Engine..."
 pip install -r requirements.txt > /dev/null 2>&1
 
-python3 build.py --output_dir whisper_small_en --use_gpt_attention_plugin --use_gemm_plugin --use_layernorm_plugin  --use_bert_attention_plugin --model_name small.en > /dev/null 2>&1
+python3 build.py --output_dir whisper_small_en --use_gpt_attention_plugin --use_gemm_plugin  --use_bert_attention_plugin --enable_context_fmha --model_name small.en
 
 mkdir -p /root/scratch-space/models
 cp -r whisper_small_en /root/scratch-space/models
+rm -rf whisper_small_en
diff --git a/docker/scripts/install-deps.sh b/docker/scripts/install-deps.sh
diff --git a/docker/scripts/install-trt-llm.sh b/docker/scripts/install-trt-llm.sh
diff --git a/docker/scripts/run-whisperfusion.sh b/docker/scripts/run-whisperfusion.sh
@@ -4,14 +4,16 @@ test -f /etc/shinit_v2 && source /etc/shinit_v2
 
 echo "Running build-models.sh..."
 cd /root/scratch-space/
-./build-models.sh
+MODEL=${MODEL:-phi-2}
+./build-models.sh $MODEL
 
 cd /root/WhisperFusion
 if [ "$1" != "mistral" ]; then
   exec python3 main.py --phi \
                   --whisper_tensorrt_path /root/scratch-space/models/whisper_small_en \
-                  --phi_tensorrt_path /root/scratch-space/models/dolphin-2_6-phi-2 \
-                  --phi_tokenizer_path /root/scratch-space/models/dolphin-2_6-phi-2/tokenizer
+                  --phi_tensorrt_path /root/scratch-space/models/$MODEL \
+                  --phi_tokenizer_path /root/scratch-space/models/$MODEL/tokenizer \
+                  --phi_model_type $MODEL
 else
   exec python3 main.py --mistral \
                   --whisper_tensorrt_path /root/scratch-space/models/whisper_small_en \

diff --git a/docker/scripts/setup-whisperfusion.sh b/docker/scripts/setup-whisperfusion.sh
@@ -1,20 +1,17 @@
 #!/bin/bash -e
 
 ## Clone this repo and install requirements
-[ -d "WhisperFusion" ] || git clone -b optimize_docker https://github.com/makaveli10/WhisperFusion.git
+[ -d "WhisperFusion" ] || git clone https://github.com/Collabora/WhisperFusion.git
 
 cd WhisperFusion
 apt update
 apt install ffmpeg portaudio19-dev -y
 
-## Install torchaudio matching the PyTorch from the base image
-pip install --extra-index-url https://download.pytorch.org/whl/cu121 torchaudio==2.1.0
-
 ## Install all the other dependencies normally
 pip install -r requirements.txt
 
 ## force update huggingface_hub (tokenizers 0.14.1 spuriously requires an ancient <=0.18 version)
-pip install -U huggingface_hub
+pip install -U huggingface_hub tokenizers
 
 huggingface-cli download collabora/whisperspeech t2s-small-en+pl.model s2a-q4-tiny-en+pl.model
 huggingface-cli download charactr/vocos-encodec-24khz
@@ -24,4 +21,4 @@ curl -L -o /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th https://
 mkdir -p /root/.cache/whisper-live/
 curl -L -o /root/.cache/whisper-live/silero_vad.onnx https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
 
-python -c 'from transformers.utils.hub import move_cache; move_cache()'
+python3 -c 'from transformers.utils.hub import move_cache; move_cache()'