chore: Update Triton + Ray Serve Tutorial for Ray Summit 2024 #115

Merged 16 commits on Sep 26, 2024
39 changes: 17 additions & 22 deletions Popular_Models_Guide/StableDiffusion/README.md
@@ -29,7 +29,7 @@
# Deploying Stable Diffusion Models with Triton and TensorRT

This example demonstrates how to deploy Stable Diffusion models in
Triton by leveraging the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion)
Triton by leveraging the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion)
pipeline and utilities.

Using the TensorRT demo as a base, this example contains a reusable
@@ -38,9 +38,9 @@ suitable for deploying multiple versions and configurations of
Diffusion models.

For more information on Stable Diffusion please visit
[stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5),
[stable-diffusion-xl](https://huggingface.co/docs/diffusers/en/using-diffusers/sdxl). For
more information on the TensorRT implementation please see the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion).
[stable-diffusion-v1-5](https://huggingface.co/benjamin-paine/stable-diffusion-v1-5),
[stable-diffusion-xl](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0). For
more information on the TensorRT implementation please see the [TensorRT demo](https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion).

> [!Note]
> This example is given as sample code and should be reviewed before use in production settings.
@@ -57,7 +57,7 @@ support matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/i
## Building the Triton Inference Server Image

The example is designed based on the
`nvcr.io/nvidia/tritonserver:24.01-py3` docker image and [TensorRT OSS v9.2.0](https://github.com/NVIDIA/TensorRT/releases/tag/v9.2.0).
`nvcr.io/nvidia/tritonserver:24.08-py3` docker image and [TensorRT OSS v10.4](https://github.com/NVIDIA/TensorRT/releases/tag/v10.4).

A set of convenience scripts are provided to create a docker image
based on the `nvcr.io/nvidia/tritonserver:24.01-py3` image with the
@@ -99,6 +99,15 @@ directory as `workspace`.

### Build Stable Diffusion v 1.5 Engine

> [!Note]
>
> The model
> [stable-diffusion-v1-5](https://huggingface.co/benjamin-paine/stable-diffusion-v1-5)
> requires logging in to Hugging Face and accepting its terms and
> conditions of use. Please set the `HF_TOKEN` environment variable
> accordingly.
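For example, a minimal sketch (the token value is a placeholder, and forwarding it into the container with `-e HF_TOKEN` is an assumption based on the `docker run` calls used elsewhere in this PR):

```bash
# Placeholder value; create a read token at https://huggingface.co/settings/tokens
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
# When the engine build runs inside a container, forward the variable, e.g.:
#   docker run ... -e HF_TOKEN ... tritonserver:r24.08-diffusion ...
```

With the token set, build the engine: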

```bash
./scripts/build_models.sh --model stable_diffusion_1_5
```
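The SDXL engine can presumably be built the same way; a sketch based on the `--model` value that `build.sh` passes for SDXL later in this diff:

```bash
./scripts/build_models.sh --model stable_diffusion_xl
```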
@@ -285,27 +294,13 @@ python3 client.py --model stable_diffusion_xl --requests 10 --clients 10

## Known Issues and Limitations

1. When shutting down the server, an invalid memory operation occurs:

> [!Note]
> This error is also seen in standalone applications outside of the Triton Inference Server
> and we believe this is due to an interaction between imported python modules. Further
> we haven't seen any issues related to this error and believe it can be safely
> ignored.


```
free(): invalid pointer
```


2. The diffusion backend doesn't yet support using an optional refiner
1. The diffusion backend doesn't yet support using an optional refiner
model, unlike the [demo][demo_reference] it's based on. See also
[demo_txt2img_xl.py][demo_code]


[demo_code]: https://github.com/NVIDIA/TensorRT/blob/release/9.2/demo/Diffusion/demo_txt2img_xl.py
[demo_code]: https://github.com/NVIDIA/TensorRT/blob/release/10.4/demo/Diffusion/demo_txt2img_xl.py


[demo_reference]: https://github.com/NVIDIA/TensorRT/tree/release/9.2/demo/Diffusion#text-to-image-using-sdxl-stable-diffusion-xl
[demo_reference]: https://github.com/NVIDIA/TensorRT/tree/release/10.4/demo/Diffusion#generate-an-image-with-stable-diffusion-xl-guided-by-a-single-text-prompt

8 changes: 4 additions & 4 deletions Popular_Models_Guide/StableDiffusion/build.sh
@@ -39,7 +39,7 @@ DOCKERFILE=${SOURCE_DIR}/docker/Dockerfile

# Base Images
BASE_IMAGE=nvcr.io/nvidia/tritonserver
BASE_IMAGE_TAG_DIFFUSION=24.01-py3
BASE_IMAGE_TAG_DIFFUSION=24.08-py3

get_options() {
while :; do
@@ -141,7 +141,7 @@ get_options() {
fi

if [ -z "$TAG" ]; then
TAG="tritonserver:r24.01"
TAG="tritonserver:r24.08"

if [[ $FRAMEWORK == "DIFFUSION" ]]; then
TAG+="-diffusion"
@@ -211,7 +211,7 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then
set -x
fi
$RUN_PREFIX mkdir -p $PWD/backend/diffusion
$RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion"
$RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion"

{ set +x; } 2>/dev/null

@@ -221,7 +221,7 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then
set -x
fi

$RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "/workspace/scripts/build_models.sh --model $model"
$RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "/workspace/scripts/build_models.sh --model $model"

{ set +x; } 2>/dev/null
done
4 changes: 2 additions & 2 deletions Popular_Models_Guide/StableDiffusion/docker/Dockerfile
@@ -29,9 +29,9 @@ ARG BASE_IMAGE_TAG=24.01-py3

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as tritonserver-stable-diffusion

RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt==9.2.0.post12.dev5
RUN pip install --pre --upgrade --extra-index-url https://pypi.nvidia.com tensorrt-cu12==10.4.0

RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/9.2 --single-branch /tmp/TensorRT
RUN git clone https://github.com/NVIDIA/TensorRT.git -b release/10.4 --single-branch /tmp/TensorRT

RUN pip3 install -r /tmp/TensorRT/demo/Diffusion/requirements.txt

2 changes: 1 addition & 1 deletion Popular_Models_Guide/StableDiffusion/run.sh
@@ -99,7 +99,7 @@ get_options() {
fi

if [ -z "$IMAGE" ]; then
IMAGE="tritonserver:r24.01"
IMAGE="tritonserver:r24.08"

if [[ $FRAMEWORK == "DIFFUSION" ]]; then
IMAGE+="-diffusion"
20 changes: 10 additions & 10 deletions Triton_Inference_Server_Python_API/README.md
@@ -54,30 +54,30 @@ https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
## Installation

The tutorial and Python API package are designed to be installed and
run within the `nvcr.io/nvidia/tritonserver:24.01-py3` docker image.
run within the `nvcr.io/nvidia/tritonserver:24.08-py3` docker image.

A set of convenience scripts are provided to create a docker image
based on the `nvcr.io/nvidia/tritonserver:24.01-py3` image with the
based on the `nvcr.io/nvidia/tritonserver:24.08-py3` image with the
Python API installed plus additional dependencies required for the
examples.

### Triton Inference Server 24.01 + Python API
### Triton Inference Server 24.08 + Python API

#### Clone Repository
```bash
git clone https://github.com/triton-inference-server/tutorials.git
cd tutorials/Triton_Inference_Server_Python_API
```

#### Build `triton-python-api:r24.01` Image
#### Build `triton-python-api:r24.08` Image
```bash
./build.sh
```

#### Supported Backends

The built image includes all the backends shipped by default in the
tritonserver `nvcr.io/nvidia/tritonserver:24.01-py3` container.
tritonserver `nvcr.io/nvidia/tritonserver:24.08-py3` container.

```
dali fil identity onnxruntime openvino python pytorch repeat square tensorflow tensorrt
Expand All @@ -95,7 +95,7 @@ different data types. The `identity` model copies provided inputs of

## Hello World

### Start `triton-python-api:r24.01` Container
### Start `triton-python-api:r24.08` Container

The following command starts a container and volume mounts the current
directory as `workspace`.
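The command itself is outside the hunks shown here; a minimal sketch, assuming the `triton-python-api:r24.08` image built above and the GPU and bind-mount flags used by other `docker run` calls in this PR:

```bash
# Hypothetical invocation; adjust the mount and image tag to your setup.
docker run --gpus all --rm -it -v ${PWD}:/workspace triton-python-api:r24.08
```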
@@ -163,7 +163,7 @@ This example is based on the
tutorial.


#### Build `triton-python-api:r24.01-diffusion` Image and Stable Diffusion Models
#### Build `triton-python-api:r24.08-diffusion` Image and Stable Diffusion Models

Please note the following command will take many minutes depending on
your hardware configuration and network connection.
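The invocation itself is elided from this diff; a sketch, assuming the `--framework` option implied by the `FRAMEWORKS` map in `build.sh` below (the exact flag spelling is an assumption):

```bash
./build.sh --framework diffusion
```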
@@ -175,7 +175,7 @@ your hardware configuration and network connection.
#### Supported Backends

The built image includes all the backends shipped by default in the
tritonserver `nvcr.io/nvidia/tritonserver:24.01-py3` container.
tritonserver `nvcr.io/nvidia/tritonserver:24.08-py3` container.

```
dali fil identity onnxruntime openvino python pytorch repeat square tensorflow tensorrt
@@ -223,13 +223,13 @@ server.models()

#### Example Output
```python
{('stable_diffusion', 1): {'name': 'stable_diffusion', 'version': 1, 'state': 'READY'}, ('text_encoder', 1): {'name': 'text_encoder', 'version': 1, 'state': 'READY'}, ('vae', 1): {'name': 'vae', 'version': 1, 'state': 'READY'}}
{('stable_diffusion_1_5', 1): {'name': 'stable_diffusion_1_5', 'version': 1, 'state': 'READY'}, ('stable_diffusion_xl', 1): {'name': 'stable_diffusion_xl', 'version': 1, 'state': 'READY'}}
```

### Send an Inference Request

```python
model = server.model("stable_diffusion")
model = server.model("stable_diffusion_xl")
responses = model.infer(inputs={"prompt":[["butterfly in new york, realistic, 4k, photograph"]]})
```
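To inspect the result, something like the following should work. This is a hedged sketch: it assumes the backend exposes a DLPack-compatible `generated_image` output holding a uint8-convertible image batch, which may differ from the actual tensor name and layout:

```python
import numpy
import PIL.Image

for response in responses:
    # Assumption: the diffusion backend returns a "generated_image" tensor.
    generated = numpy.from_dlpack(response.outputs["generated_image"])
    # Drop the batch dimension and save as a JPEG.
    image = PIL.Image.fromarray(generated.squeeze().astype(numpy.uint8))
    image.save("generated_image.jpg")
```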

51 changes: 18 additions & 33 deletions Triton_Inference_Server_Python_API/build.sh
@@ -30,7 +30,7 @@ RUN_PREFIX=
BUILD_MODELS=

# Frameworks
declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["TRT_LLM"]=2 ["IDENTITY"]=3)
declare -A FRAMEWORKS=(["DIFFUSION"]=1 ["IDENTITY"]=3)
DEFAULT_FRAMEWORK=IDENTITY

SOURCE_DIR=$(dirname "$(readlink -f "$0")")
@@ -39,9 +39,8 @@ DOCKERFILE=${SOURCE_DIR}/docker/Dockerfile

# Base Images
BASE_IMAGE=nvcr.io/nvidia/tritonserver
BASE_IMAGE_TAG_IDENTITY=24.01-py3
BASE_IMAGE_TAG_DIFFUSION=24.01-py3
BASE_IMAGE_TAG_TRT_LLM=24.01-trtllm-python-py3
BASE_IMAGE_TAG_IDENTITY=24.08-py3
BASE_IMAGE_TAG_DIFFUSION=24.08-py3

get_options() {
while :; do
@@ -138,11 +137,7 @@ get_options() {
fi

if [ -z "$TAG" ]; then
TAG="triton-python-api:r24.01"

if [[ $FRAMEWORK == "TRT_LLM" ]]; then
TAG+="-trt-llm"
fi
TAG="triton-python-api:r24.08"

if [[ $FRAMEWORK == "DIFFUSION" ]]; then
TAG+="-diffusion"
@@ -186,7 +181,7 @@ get_options "$@"

if [[ $FRAMEWORK == DIFFUSION ]]; then
BASE_IMAGE="tritonserver"
BASE_IMAGE_TAG="r24.01-diffusion"
BASE_IMAGE_TAG="r24.08-diffusion"
fi

# BUILD RUN TIME IMAGE
@@ -207,17 +202,18 @@ if [[ $FRAMEWORK == DIFFUSION ]]; then
if [ -z "$RUN_PREFIX" ]; then
set -x
fi
$RUN_PREFIX mkdir -p backend/diffusion
$RUN_PREFIX $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/build.sh --framework diffusion --tag tritonserver:r24.01-diffusion
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/backend/diffusion/model.py backend/diffusion/model.py
$RUN_PREFIX mkdir -p diffusion-models/stable_diffusion_1_5/1
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/config.pbtxt diffusion-models/stable_diffusion_1_5/config.pbtxt
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep diffusion-models/stable_diffusion_1_5/1/.gitkeep
$RUN_PREFIX mkdir -p diffusion-models/stable_diffusion_xl/1
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/config.pbtxt diffusion-models/stable_diffusion_xl/config.pbtxt
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep diffusion-models/stable_diffusion_xl/1/.gitkeep
$RUN_PREFIX mkdir -p scripts/stable_diffusion
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/scripts/build_models* scripts/stable_diffusion/
$RUN_PREFIX mkdir -p ${SOURCE_DIR}/backend/diffusion
$RUN_PREFIX $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/build.sh --framework diffusion --tag tritonserver:r24.08-diffusion
$RUN_PREFIX docker run --rm -it -v ${SOURCE_DIR}:/workspace tritonserver:r24.08-diffusion /bin/bash -c "cp -rf /tmp/TensorRT/demo/Diffusion /workspace/backend/diffusion"
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/backend/diffusion/model.py ${SOURCE_DIR}/backend/diffusion/model.py
$RUN_PREFIX mkdir -p ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/1
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/config.pbtxt ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/config.pbtxt
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_1_5/1/.gitkeep ${SOURCE_DIR}/diffusion-models/stable_diffusion_1_5/1/.gitkeep
$RUN_PREFIX mkdir -p ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/1
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/config.pbtxt ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/config.pbtxt
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/diffusion-models/stable_diffusion_xl/1/.gitkeep ${SOURCE_DIR}/diffusion-models/stable_diffusion_xl/1/.gitkeep
$RUN_PREFIX mkdir -p ${SOURCE_DIR}/scripts/stable_diffusion
$RUN_PREFIX cp $SOURCE_DIR/../Popular_Models_Guide/StableDiffusion/scripts/build_models* ${SOURCE_DIR}/scripts/stable_diffusion/

fi

@@ -231,25 +227,14 @@ $RUN_PREFIX docker build -f $DOCKERFILE $BUILD_OPTIONS $BUILD_ARGS -t $TAG $SOUR
{ set +x; } 2>/dev/null


if [[ $FRAMEWORK == TRT_LLM ]]; then
if [ -z "$RUN_PREFIX" ]; then
set -x
fi

$RUN_PREFIX docker build -f $SOURCE_DIR/docker/Dockerfile.trt-llm-engine-builder $BUILD_OPTIONS $BUILD_ARGS -t trt-llm-engine-builder $SOURCE_DIR $NO_CACHE

{ set +x; } 2>/dev/null

fi;

if [[ $FRAMEWORK == IDENTITY ]] || [[ $BUILD_MODELS == TRUE ]]; then

if [[ $FRAMEWORK == DIFFUSION ]]; then
if [ -z "$RUN_PREFIX" ]; then
set -x
fi

$RUN_PREFIX docker run --rm -it -v $PWD:/workspace $TAG /bin/bash -c "/workspace/scripts/stable_diffusion/build_models.sh --model stable_diffusion_1_5"
$RUN_PREFIX docker run --gpus all --rm -it -v ${SOURCE_DIR}:/workspace $TAG /bin/bash -c "/workspace/scripts/stable_diffusion/build_models.sh --model stable_diffusion_xl"

{ set +x; } 2>/dev/null
fi
10 changes: 1 addition & 9 deletions Triton_Inference_Server_Python_API/deps/requirements.txt
@@ -24,14 +24,6 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

awscli
fastapi==0.97.0
ftfy
mypy
pyright
pytest
ray[all]==2.9
scipy
sphinx
sphinx-markdown-builder
starlette==0.27.0
ray[all]==2.36.0
Binary file not shown.
28 changes: 9 additions & 19 deletions Triton_Inference_Server_Python_API/docker/Dockerfile
@@ -25,37 +25,27 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver
ARG BASE_IMAGE_TAG=24.01-py3
ARG BASE_IMAGE_TAG=24.08-py3

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as triton-python-api

RUN apt-get update; apt-get install -y gdb

COPY ./deps/requirements.txt /tmp/requirements.txt

RUN pip install --timeout=2000 -r /tmp/requirements.txt
RUN --mount=type=bind,source=./deps/requirements.txt,target=/tmp/requirements.txt \
pip install --timeout=2000 --requirement /tmp/requirements.txt

# Finish pyright install

RUN pyright --help

COPY ./deps/tritonserver-2.41.0.dev0-py3-none-any.whl /tmp/tritonserver-2.41.0.dev0-py3-none-any.whl

RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
"tritonserver-*.whl" | xargs -I {} pip3 install --force-reinstall --upgrade {}[all]
"tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all]

RUN pip3 show tritonserver 1>/dev/null || \
if [ $? != 0 ]; then \
pip3 install /tmp/tritonserver-2.41.0.dev0-py3-none-any.whl[all] ;\
fi
# grafana
RUN apt-get install -y adduser libfontconfig1 musl && \
wget https://dl.grafana.com/enterprise/release/grafana-enterprise_11.2.0_amd64.deb && \
dpkg -i grafana-enterprise_11.2.0_amd64.deb && \
rm -rf grafana-enterprise_11.2.0_amd64.deb

RUN ln -sf /bin/bash /bin/sh

COPY . /workspace

ARG RUN_TESTS=FALSE

RUN if [[ "$RUN_TESTS" == "TRUE" ]] ; then cd /tmp && git clone -b r23.12-python-api https://github.com/triton-inference-server/core.git && cp -rf /tmp/core/python/test /workspace/deps/ ; fi

RUN if [[ "$RUN_TESTS" == "TRUE" ]] ; then pytest /workspace/deps ; fi
