Include two different stages for building TGI image: (#34)
* Include two different stages for building TGI image:
- Default standalone image
- Inference Endpoint specific image

* Include a make rule for tpu-tgi-ie

* Style

* Fix not using the latest layer correctly

* Using the exact same as raw TGI

* Update documentation

* (docs) Remove specific optimum-tpu version mention in the doc and use latest

* (docs) Use $MODEL_ID to specify the model when starting TGI container
mfuntowicz authored May 3, 2024
1 parent 062180f commit c9937a9
Showing 4 changed files with 34 additions and 19 deletions.
Makefile (10 changes: 9 additions & 1 deletion)
@@ -43,12 +43,20 @@ clean:
rm -rf dist

tpu-tgi:
-	docker build --rm -f text-generation-inference/Dockerfile \
+	docker build --rm -f text-generation-inference/docker/Dockerfile \
--build-arg VERSION=$(VERSION) \
--build-arg TGI_VERSION=$(TGI_VERSION) \
-t huggingface/optimum-tpu:$(VERSION)-tgi .
docker tag huggingface/optimum-tpu:$(VERSION)-tgi huggingface/optimum-tpu:latest

+tpu-tgi-ie:
+	docker build --rm -f text-generation-inference/docker/Dockerfile \
+		--target inference-endpoint \
+		--build-arg VERSION=$(VERSION) \
+		--build-arg TGI_VERSION=$(TGI_VERSION) \
+		-t huggingface/optimum-tpu:$(VERSION)-tgi .
+	docker tag huggingface/optimum-tpu:$(VERSION)-tgi huggingface/optimum-tpu:latest-ie

# Run code quality checks
style_check:
ruff .
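The two rules differ only in the `--target` flag and the final tag. A minimal usage sketch, assuming `VERSION` and `TGI_VERSION` are defined at the top of the Makefile (only the build args above confirm they exist):

```
# Build the standalone TGI image; tags huggingface/optimum-tpu:<VERSION>-tgi
# and huggingface/optimum-tpu:latest
make tpu-tgi

# Build the Inference Endpoints variant from the dedicated build stage;
# additionally tags huggingface/optimum-tpu:latest-ie
make tpu-tgi-ie
```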
docs/source/howto/serving.mdx (28 changes: 14 additions & 14 deletions)
@@ -17,20 +17,20 @@ Optimum-TPU provides a `make tpu-tgi` command at the root level to help you create
### Docker Container Run

```
-OPTIMUM_TPU_VERSION=0.1.0b1
-docker run -p 8080:80 \
-        --net=host --privileged \
-        -v $(pwd)/data:/data \
-        -e HF_TOKEN=${HF_TOKEN} \
-        -e HF_BATCH_SIZE=1 \
-        -e HF_SEQUENCE_LENGTH=1024 \
-        huggingface/optimum-tpu:${OPTIMUM_TPU_VERSION}-tgi \
-        --model-id google/gemma-2b \
-        --max-concurrent-requests 4 \
-        --max-input-length 512 \
-        --max-total-tokens 1024 \
-        --max-batch-prefill-tokens 512 \
-        --max-batch-total-tokens 1024
+HF_TOKEN=<your_hf_token_here>
+MODEL_ID=google/gemma-2b
+sudo docker run -p 8080:80 \
+        --net=host \
+        --privileged \
+        -v $(pwd)/data:/data \
+        -e HF_TOKEN=${HF_TOKEN} \
+        huggingface/optimum-tpu:latest \
+        --model-id ${MODEL_ID} \
+        --max-concurrent-requests 4 \
+        --max-input-length 32 \
+        --max-total-tokens 64 \
+        --max-batch-size 1
```

### Executing requests against the service
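With the container from the updated example running, requests go against TGI's standard `/generate` route. A sketch, assuming the service is reachable on localhost port 8080 as the `-p 8080:80` mapping suggests (with `--net=host` the effective port can differ):

```
# Send a generation request to the TGI server started above.
# localhost:8080 is an assumption based on the -p 8080:80 mapping.
curl localhost:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 32}}'
```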
text-generation-inference/docker/Dockerfile (15 changes: 11 additions & 4 deletions)
@@ -119,10 +119,17 @@ COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/
COPY --from=pyserver /pyserver/build/dist dist
RUN pip install dist/text_generation_server*.tar.gz

-# TPU compatible image
-FROM tpu_base
+# TPU compatible image for Inference Endpoints
+FROM tpu_base as inference-endpoint

-COPY text-generation-inference/entrypoint.sh entrypoint.sh
+COPY text-generation-inference/docker/entrypoint.sh entrypoint.sh
RUN chmod +x entrypoint.sh

ENTRYPOINT ["./entrypoint.sh"]
ENTRYPOINT ["./entrypoint.sh"]

# TPU compatible image
FROM tpu_base

ENTRYPOINT ["text-generation-launcher"]
CMD ["--json-output"]
text-generation-inference/entrypoint.sh → text-generation-inference/docker/entrypoint.sh (file renamed without changes)
